## Import Necessary Libraries

In [1]:
import pandas as pd
import re
from spacing import Spacing
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample
from hazm import *

## Loading the Dataset

In [2]:
# Load the CSV file
data = pd.read_csv('taghche.csv')
# Remove duplicate rows and the ones with empty or NaN values in 'comment' or 'rate' columns
data = data.drop_duplicates()
data.dropna(subset=['comment', 'rate'], inplace=True)

# Print the first 5 rows
data.head()

Unnamed: 0,date,comment,bookname,rate,bookID,like
0,1395/11/14,اسم کتاب No one writes to the Colonel\nترجمش...,سرهنگ کسی ندارد برایش نامه بنویسد,0.0,3.0,2.0
1,1395/11/14,"طاقچه عزیز،نام کتاب""کسی به سرهنگ نامه نمینویسد...",سرهنگ کسی ندارد برایش نامه بنویسد,5.0,3.0,2.0
2,1394/06/06,بنظرم این اثر مارکز خیلی از صد سال تنهایی که ب...,سرهنگ کسی ندارد برایش نامه بنویسد,5.0,3.0,0.0
3,1393/09/02,به نظر کتاب خوبی میومد اما من از ترجمش خوشم نی...,سرهنگ کسی ندارد برایش نامه بنویسد,2.0,3.0,0.0
4,1393/06/29,کتاب خوبی است,سرهنگ کسی ندارد برایش نامه بنویسد,3.0,3.0,0.0


In [3]:
# Define a function to label the sentiment based on rating thresholds
def label_sentiment(rate, positive_threshold, neutral_threshold):
    if rate >= positive_threshold:
        return 'positive'
    elif rate >= neutral_threshold:
        return 'neutral'
    else:
        return 'negative'

## Balancing Dataset

In [4]:
# Function to prepare data and labels based on given thresholds
def prepare_data(positive_threshold, neutral_threshold):
    labeled_data = data.copy()
    labeled_data['sentiment'] = labeled_data['rate'].apply(lambda x: label_sentiment(x, positive_threshold, neutral_threshold))
    
    # Combine the features and labels into a single dataframe
    df = pd.concat([labeled_data['comment'], labeled_data['sentiment']], axis=1)

    # Separate the classes
    positive = df[df['sentiment'] == 'positive']
    neutral = df[df['sentiment'] == 'neutral']
    negative = df[df['sentiment'] == 'negative']

    # Find the minority class
    min_class_size = min(len(positive), len(neutral), len(negative))

    # Downsample the majority classes
    positive_downsampled = resample(positive, replace=False, n_samples=min_class_size, random_state=42)
    neutral_downsampled = resample(neutral, replace=False, n_samples=min_class_size, random_state=42)
    negative_downsampled = resample(negative, replace=False, n_samples=min_class_size, random_state=42)

    # Combine the downsampled classes into a single dataframe
    df_balanced = pd.concat([positive_downsampled, neutral_downsampled, negative_downsampled])

    # Shuffle the dataset
    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
    
    return df_balanced['comment'], df_balanced['sentiment']

## Preprocess

In [5]:
# Define a list of Farsi stop words
def load_stop_words(file_paths):
    stop_words_set = set()
    for file_path in file_paths:
        with open('stopwords/' + file_path, 'r', encoding='utf-8') as file:
            stop_words_set.update(file.read().splitlines())
    return stop_words_set

# List of paths to your stop words files
stop_words_files = ['verbal.txt', 'persian.txt', 'short.txt', 'chars.txt', 'nonverbal.txt']

# Load stop words from files
stop_words = load_stop_words(stop_words_files)

# Define a function to preprocess and normalize the text
def preprocess(text):
    # Remove special characters
    text = re.sub(r'\|\\n|\n', ' ', text)
    # Remove non-Persian characters and digits
    text = re.sub(r'[^آ-ی\s]', ' ', text)
    # Define a regular expression pattern that matches one or more spaces
    pattern = re.compile(r" +")
    # Apply the pattern to the text and replace the matches with a single space
    text = pattern.sub(" ", text)
    # Normalize again using the function for HW1 (mostly for fixing half spaces)
    #sp = Spacing()
    #text = sp.fix(text)
    # Remove stopwords
    text = text.split()
    text = [word for word in text if word not in stop_words]
    # Join words back
    text = " ".join(text)
    return text

# Apply the function to the 'comment' column
data['comment'] = data['comment'].apply(preprocess)

# Remove duplicate rows and the ones with empty or NaN values in 'comment' or 'rate' columns
data = data.drop_duplicates()
data.dropna(subset=['comment', 'rate'], inplace=True)

data.head()

Unnamed: 0,date,comment,bookname,rate,bookID,like
0,1395/11/14,اسم کتاب ترجمش میشه سرهنگ نامه نمینویسد مترجمی...,سرهنگ کسی ندارد برایش نامه بنویسد,0.0,3.0,2.0
1,1395/11/14,طاقچه عزیز کتاب سرهنگ نامه نمینویسد متن اشتباه...,سرهنگ کسی ندارد برایش نامه بنویسد,5.0,3.0,2.0
2,1394/06/06,بنظرم اثر مارکز سال تنهایی بخاطرش نوبل ادبیات ...,سرهنگ کسی ندارد برایش نامه بنویسد,5.0,3.0,0.0
3,1393/09/02,نظر کتاب خوبی میومد ترجمش خوشم نیومد لحنش طوری...,سرهنگ کسی ندارد برایش نامه بنویسد,2.0,3.0,0.0
4,1393/06/29,کتاب خوبی,سرهنگ کسی ندارد برایش نامه بنویسد,3.0,3.0,0.0


# Don't run this cell

In [5]:
def lemmatization(text):
    # Tokenize text
    tokens = word_tokenize(text)
    # Lemmatize the tokens and then concatenate
    lemmatizer = Lemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    lemmatized_text = ' '.join(tokens)
    return lemmatized_text

# Apply the lemmatizer function to the 'comment' column
data['comment'] = data['comment'].apply(lemmatization)

data.head(10)

Unnamed: 0,date,comment,bookname,rate,bookID,like
0,1395/11/14,اسم کتاب No one writes to the Colonel ترجمش می...,سرهنگ کسی ندارد برایش نامه بنویسد,0.0,3.0,2.0
1,1395/11/14,"طاقچه عزیز ، نام کتاب "" کسی به سرهنگ نامه نمین...",سرهنگ کسی ندارد برایش نامه بنویسد,5.0,3.0,2.0
2,1394/06/06,بنظر این اثر مارکز خیلی از صد سال تنهایی که بخ...,سرهنگ کسی ندارد برایش نامه بنویسد,5.0,3.0,0.0
3,1393/09/02,به نظر کتاب خوبی میومد اما من از ترجمش خو نیوم...,سرهنگ کسی ندارد برایش نامه بنویسد,2.0,3.0,0.0
4,1393/06/29,کتاب خوبی است,سرهنگ کسی ندارد برایش نامه بنویسد,3.0,3.0,0.0
5,1393/05/02,راستش خیلی خو نیومد ازش !,سرهنگ کسی ندارد برایش نامه بنویسد,3.0,3.0,0.0
6,1393/04/11,کتابی شیوا و بینظیر یکی دیگر از شاهکار های اقا...,سرهنگ کسی ندارد برایش نامه بنویسد,5.0,3.0,0.0
7,1398/09/30,"هر هشت تای این داستان توی کتاب "" بهترین داستان...",هشت داستان کوتاه,3.0,5.0,1.0
8,1393/12/02,سلام چرا این نمونه نداره که بدون کدوم داستان ک...,هشت داستان کوتاه,1.0,5.0,1.0
9,1393/05/20,داشت#دار ب این جمع بست#بند میرسم ک مارکز هیچ ج...,هشت داستان کوتاه,2.0,5.0,1.0


In [None]:
data.to_csv('preprocessed_taghche.csv', index=False)

## TF-IDF Vectorizer - Logistic Regression

In [6]:
# Custom scorer function to handle dynamic threshold changes
def custom_scorer(estimator, X, y, positive_threshold, neutral_threshold):
    X_prepared, y_prepared = prepare_data(positive_threshold, neutral_threshold)
    X_train, X_test, y_train, y_test = train_test_split(X_prepared, y_prepared, test_size=0.1, random_state=42)
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    return accuracy_score(y_test, y_pred)

# Create a pipeline with TF-IDF and logistic regression
logReg_PL = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("logreg", LogisticRegression(max_iter=500))
])

# Define the parameter grid for GridSearchCV
param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__max_features': [5000, 10000],
    'logreg__C': [0.01, 0.1, 1, 10]
}

# Custom GridSearchCV implementation to iterate over parameter grid
best_score = 0
best_params = None

# Thresholds to evaluate
rate_thresholds = [(1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4)]

for neutral_threshold, positive_threshold in rate_thresholds:
        X_prepared, y_prepared = prepare_data(positive_threshold, neutral_threshold)
        X_train, X_test, y_train, y_test = train_test_split(X_prepared, y_prepared, test_size=0.1, random_state=42)
        
        grid_search = GridSearchCV(logReg_PL, param_grid, cv=5, scoring='accuracy')
        grid_search.fit(X_train, y_train)
        
        score = grid_search.best_score_
        if score > best_score:
            best_score = score
            best_params = grid_search.best_params_
            best_params['positive_threshold'] = positive_threshold
            best_params['neutral_threshold'] = neutral_threshold

# Print best parameters
print("Best parameters for TF-IDF model are:", best_params)

# Get the best model with the best parameters
X_prepared, y_prepared = prepare_data(best_params['positive_threshold'], best_params['neutral_threshold'])
X_train, X_test, y_train, y_test = train_test_split(X_prepared, y_prepared, test_size=0.1, random_state=42)

best_logReg_model = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=best_params['tfidf__ngram_range'], max_features=best_params['tfidf__max_features'])),
    ("logreg", LogisticRegression(C=best_params['logreg__C'], max_iter=500))
])

best_logReg_model.fit(X_train, y_train)

# Predict the labels on the test set
y_test_pred = best_logReg_model.predict(X_test)

# Calculate the accuracy score
test_accuracy = accuracy_score(y_test, y_test_pred)

# Print the result
print("Test accuracy of Logistic Regression model: ", test_accuracy)

# Print detailed classification report
print(classification_report(y_test, y_test_pred))

Best parameters for TF-IDF model are: {'logreg__C': 0.1, 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 2), 'positive_threshold': 4, 'neutral_threshold': 2}
Test accuracy of Logistic Regression model:  0.5606361829025845
              precision    recall  f1-score   support

    negative       0.53      0.59      0.56      1020
     neutral       0.52      0.52      0.52       987
    positive       0.65      0.58      0.61      1011

    accuracy                           0.56      3018
   macro avg       0.56      0.56      0.56      3018
weighted avg       0.57      0.56      0.56      3018
