In [1]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, chi2

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Pre Processing

In [2]:
# NOTE: this whole cell is one triple-quoted string, so none of the code inside
# it executes; the cell only evaluates to (and echoes) the string itself.
# The disabled code reads "../Dataset/IMDB Dataset.csv", renames the two columns,
# samples 5000 rows, strips digits, punctuation and English stopwords from the
# reviews, lemmatizes the remaining tokens and writes the result to
# "../csv/Preprocessed_data.csv".
# NOTE(review): a later cell reads "../csv/Preprocessed_data.csv". With this code
# disabled, that file has to exist on disk already; the saved output of that
# later cell is a FileNotFoundError.
# NOTE(review): df.sample(n=5000) is called without random_state, so re-enabling
# this code would draw a different subset on every run - confirm that is intended.
# NOTE(review): stopwords are matched case-sensitively and only after punctuation
# has been removed - verify this filters what it is expected to filter.
'''df = pd.read_csv("../Dataset/IMDB Dataset.csv")
df.head()
df = df.rename(columns={'review': 'OriginalReviews'})
df = df.rename(columns={'sentiment': 'OutputSentiment'})
df_subset = df.sample(n=5000).reset_index(drop=True)
df_subset.head()
df_subset['OutputSentiment'].value_counts()

def remove_numbers_from_column(text):
    return re.sub(r'\d+', '', text)

df_subset['OriginalReviews'] = df_subset['OriginalReviews'].apply(remove_numbers_from_column)

def remove_punc(text):
    return text.translate(str.maketrans('', '', string.punctuation))

df_subset['OriginalReviews'] = df_subset['OriginalReviews'].apply(remove_punc)

sw = stopwords.words('english')
df_subset['OriginalReviews'] = df_subset['OriginalReviews'].apply(lambda x:[word for word in x.split() if word not in sw]).apply(lambda x:" ".join(x))

# Lemmatization function
def lemmatize_column(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(lemmatized_words)

# Apply the function to the specific column
df_subset['OriginalReviews'] = df_subset['OriginalReviews'].apply(lemmatize_column)

df_subset.to_csv("../csv/Preprocessed_data.csv",index=False)'''

'df = pd.read_csv("../Dataset/IMDB Dataset.csv")\ndf.head()\ndf = df.rename(columns={\'review\': \'OriginalReviews\'})\ndf = df.rename(columns={\'sentiment\': \'OutputSentiment\'})\ndf_subset = df.sample(n=5000).reset_index(drop=True)\ndf_subset.head()\ndf_subset[\'OutputSentiment\'].value_counts()\n\ndef remove_numbers_from_column(text):\n    return re.sub(r\'\\d+\', \'\', text)\n\ndf_subset[\'OriginalReviews\'] = df_subset[\'OriginalReviews\'].apply(remove_numbers_from_column)\n\ndef remove_punc(text):\n    return text.translate(str.maketrans(\'\', \'\', string.punctuation))\n\ndf_subset[\'OriginalReviews\'] = df_subset[\'OriginalReviews\'].apply(remove_punc)\n\nsw = stopwords.words(\'english\')\ndf_subset[\'OriginalReviews\'] = df_subset[\'OriginalReviews\'].apply(lambda x:[word for word in x.split() if word not in sw]).apply(lambda x:" ".join(x))\n\n# Lemmatization function\ndef lemmatize_column(text):\n    lemmatizer = WordNetLemmatizer()\n    tokens = word_tokenize(text)\n    lem

## Feature Extraction Using TF-IDF

In [3]:
# NOTE: this whole cell is one triple-quoted string, so none of the code inside
# it executes; the cell only evaluates to (and echoes) the string itself.
# The disabled code fits a TfidfVectorizer + SelectKBest(f_classif, k=13000)
# pipeline on preprocessed['OriginalReviews'] against
# preprocessed['OutputSentiment'], turns the selected features into a DataFrame,
# concatenates it column-wise with `preprocessed` and writes the result to
# "../csv/tfidf_df_13k.csv".
# NOTE(review): in the cells visible here, `preprocessed` is only assigned in a
# later cell, so re-enabling this code in place would fail on a fresh
# top-to-bottom run.
# NOTE(review): to_csv is called without index=False, so the row index is also
# written to the CSV as an extra column.
'''num_features_to_keep = 13000

# Create a pipeline with TfidfVectorizer and SelectKBest
pipeline = make_pipeline(TfidfVectorizer(), SelectKBest(f_classif, k=num_features_to_keep))

# Fit and transform your data
X_transformed = pipeline.fit_transform(preprocessed['OriginalReviews'], preprocessed['OutputSentiment'])

# Get the selected feature names
selected_feature_names = pipeline.named_steps['tfidfvectorizer'].get_feature_names_out()[pipeline.named_steps['selectkbest'].get_support()]

# Create a DataFrame with the selected features
selected_features_df = pd.DataFrame(X_transformed.toarray(), columns=selected_feature_names)

# Concatenate the existing DataFrame with the new selected features DataFrame
tfidf_df_13k = pd.concat([preprocessed, selected_features_df], axis=1)

tfidf_df_13k.head()

tfidf_df_13k.to_csv("../csv/tfidf_df_13k.csv")'''

'num_features_to_keep = 13000\n\n# Create a pipeline with TfidfVectorizer and SelectKBest\npipeline = make_pipeline(TfidfVectorizer(), SelectKBest(f_classif, k=num_features_to_keep))\n\n# Fit and transform your data\nX_transformed = pipeline.fit_transform(preprocessed[\'OriginalReviews\'], preprocessed[\'OutputSentiment\'])\n\n# Get the selected feature names\nselected_feature_names = pipeline.named_steps[\'tfidfvectorizer\'].get_feature_names_out()[pipeline.named_steps[\'selectkbest\'].get_support()]\n\n# Create a DataFrame with the selected features\nselected_features_df = pd.DataFrame(X_transformed.toarray(), columns=selected_feature_names)\n\n# Concatenate the existing DataFrame with the new selected features DataFrame\ntfidf_df_13k = pd.concat([preprocessed, selected_features_df], axis=1)\n\ntfidf_df_13k.head()\n\ntfidf_df_13k.to_csv("../csv/tfidf_df_13k.csv")'

## CONNOTATIONS

In [4]:
# NOTE: this whole cell is one triple-quoted string, so none of the code inside
# it executes; the cell only evaluates to (and echoes) the string itself.
# The disabled code adds four count columns to tfidf_df_13k:
#   * Positive_/Negative_Connotation_Count - words of each review whose entry in
#     connotations.csv has emotion 'positive' / 'negative';
#   * Positive_/Negative_Word_Count - words of each review found in
#     positive-words.txt / negative-words.txt;
# and writes the result to "../csv/tfidf_df_13k_connotations.csv".
# NOTE(review): the `positive` and `negative` frames read at the top are not
# used by any code visible in this cell.
# NOTE(review): `delimiter` is passed to read_csv positionally and
# `squeeze=True` is used; pandas 2.x is believed to reject both - confirm
# against the installed version before re-enabling.
# NOTE(review): to_csv is called without index=False, so the row index is also
# written to the CSV as an extra column.
'''delimiter = '\t'

# Read the text file into a DataFrame
positive = pd.read_csv('../Connotations/positive-words.txt', delimiter,names=['words'])
negative = pd.read_csv('../Connotations/negative-words.txt', delimiter,names=['words'])
connotations = pd.read_csv("../Connotations/connotations.csv")

word_emotion_map = dict(zip(connotations['word'], connotations['emotion']))

def update_counts(review):
    positive_count = sum(1 for word in review.split() if word in word_emotion_map and word_emotion_map[word] == 'positive')
    negative_count = sum(1 for word in review.split() if word in word_emotion_map and word_emotion_map[word] == 'negative')
    return positive_count, negative_count

tfidf_df_13k[['Positive_Connotation_Count', 'Negative_Connotation_Count']] = tfidf_df_13k['OriginalReviews'].apply(update_counts).tolist()

# Load positive and negative words from files
positive_words = set(pd.read_csv('../Connotations/positive-words.txt', header=None, squeeze=True).tolist())
negative_words = set(pd.read_csv('../Connotations/negative-words.txt', header=None, squeeze=True).tolist())

# Assuming 'tfidf_df_13k' is your DataFrame

# Define a function to update counts based on positive and negative words
def update_word_counts(review):
    positive_count = sum(1 for word in review.split() if word in positive_words)
    negative_count = sum(1 for word in review.split() if word in negative_words)
    return positive_count, negative_count

# Apply the function to the 'OriginalReviews' column and unpack the result into two new columns
tfidf_df_13k[['Positive_Word_Count', 'Negative_Word_Count']] = tfidf_df_13k['OriginalReviews'].apply(update_word_counts).tolist()
tfidf_df_13k.to_csv("../csv/tfidf_df_13k_connotations.csv")'''


'delimiter = \'\t\'\n\n# Read the text file into a DataFrame\npositive = pd.read_csv(\'../Connotations/positive-words.txt\', delimiter,names=[\'words\'])\nnegative = pd.read_csv(\'../Connotations/negative-words.txt\', delimiter,names=[\'words\'])\nconnotations = pd.read_csv("../Connotations/connotations.csv")\n\nword_emotion_map = dict(zip(connotations[\'word\'], connotations[\'emotion\']))\n\ndef update_counts(review):\n    positive_count = sum(1 for word in review.split() if word in word_emotion_map and word_emotion_map[word] == \'positive\')\n    negative_count = sum(1 for word in review.split() if word in word_emotion_map and word_emotion_map[word] == \'negative\')\n    return positive_count, negative_count\n\ntfidf_df_13k[[\'Positive_Connotation_Count\', \'Negative_Connotation_Count\']] = tfidf_df_13k[\'OriginalReviews\'].apply(update_counts).tolist()\n\n# Load positive and negative words from files\npositive_words = set(pd.read_csv(\'../Connotations/positive-words.txt\', header=N

In [5]:
# Load the cached preprocessed reviews from disk.
# NOTE(review): the only visible code that writes this file is wrapped in a
# string literal (disabled), and the saved output of this cell is a
# FileNotFoundError - the CSV must be regenerated before this cell can run.
preprocessed = pd.read_csv('../csv/Preprocessed_data.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../csv/Preprocessed_data.csv'

In [None]:
# Load the cached TF-IDF feature table from disk.
# NOTE(review): no later cell visible in this notebook reads `tfidf_df_13k`
# (only `tfidf_df_13k_connotations` is used) - confirm this load is still needed.
tfidf_df_13k = pd.read_csv("../csv/tfidf_df_13k.csv")

In [None]:
# Load the cached TF-IDF table that also carries the four connotation / word
# count columns.
# NOTE(review): presumably this CSV was written with its row index included,
# which is why an 'Unnamed: 0' column is dropped in the next cell - verify.
tfidf_df_13k_connotations = pd.read_csv('../csv/tfidf_df_13k_connotations.csv')

In [None]:
# Remove the stray index column ('Unnamed: 0') that came in with the CSV.
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone is a no-op instead of raising KeyError.
tfidf_df_13k_connotations = tfidf_df_13k_connotations.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Build the frame used for statistical feature selection: everything except the
# raw review text and the four hand-crafted count columns is kept (the
# 'OutputSentiment' label stays in and is split off by later cells).
non_statistical_columns = [
    'OriginalReviews',
    'Positive_Connotation_Count',
    'Negative_Connotation_Count',
    'Positive_Word_Count',
    'Negative_Word_Count',
]
df_statistical = tfidf_df_13k_connotations.drop(columns=non_statistical_columns)
df_statistical.head()

In [None]:
# Replace the sentiment labels with the integer codes assigned by LabelEncoder.
# The fitted encoder stays available as `label` for later inverse lookups.
label = LabelEncoder()
encoded_sentiment = label.fit_transform(df_statistical['OutputSentiment'])
df_statistical['OutputSentiment'] = encoded_sentiment

## CHI SQUARE

In [None]:
# Chi-square feature selection (k=5000): keep the 5000 columns of the
# statistical feature matrix with the highest chi2 score against the sentiment
# label, then append the four connotation / word-count columns.
# NOTE(review): the selector is fitted on every row before cross-validation, so
# the chosen features have already seen the held-out folds - confirm this is
# acceptable for the reported accuracies.
connotation_columns = [
    'Positive_Connotation_Count',
    'Negative_Connotation_Count',
    'Positive_Word_Count',
    'Negative_Word_Count',
]

chi2_selector = SelectKBest(chi2, k=5000)

# Split the label off the feature matrix and fit the selector.
X = df_statistical.drop(columns=['OutputSentiment'])
y = df_statistical['OutputSentiment']
X_5000 = chi2_selector.fit_transform(X, y)

# Map the selector's column indices back to column names.
selected_feature_indices = chi2_selector.get_support(indices=True)
selected_feature_names = X.columns[selected_feature_indices]

# The count columns are picked by name rather than by position, so a change in
# the CSV's column order cannot silently pull in the wrong columns.
chisq_5k = pd.concat(
    [X[selected_feature_names], tfidf_df_13k_connotations[connotation_columns]],
    axis=1,
)
chisq_5k.head()

In [None]:
# Chi-square feature selection (k=8000): keep the 8000 columns of the
# statistical feature matrix with the highest chi2 score against the sentiment
# label, then append the four connotation / word-count columns.
# NOTE(review): the selector is fitted on every row before cross-validation, so
# the chosen features have already seen the held-out folds - confirm this is
# acceptable for the reported accuracies.
connotation_columns = [
    'Positive_Connotation_Count',
    'Negative_Connotation_Count',
    'Positive_Word_Count',
    'Negative_Word_Count',
]

chi2_selector = SelectKBest(chi2, k=8000)

# Split the label off the feature matrix and fit the selector.
X = df_statistical.drop(columns=['OutputSentiment'])
y = df_statistical['OutputSentiment']
X_8000 = chi2_selector.fit_transform(X, y)

# Map the selector's column indices back to column names.
selected_feature_indices = chi2_selector.get_support(indices=True)
selected_feature_names = X.columns[selected_feature_indices]

# The count columns are picked by name rather than by position, so a change in
# the CSV's column order cannot silently pull in the wrong columns.
chisq_8k = pd.concat(
    [X[selected_feature_names], tfidf_df_13k_connotations[connotation_columns]],
    axis=1,
)
chisq_8k.head()

## CORRELATION

In [None]:
# Correlation-based feature selection: rank every feature column by the
# absolute value of its Pearson correlation with the sentiment label and keep
# the 2000 and 5000 strongest ones.
# NOTE(review): the ranking uses every row, i.e. it is computed before
# cross-validation - confirm this is acceptable for the reported accuracies.
target_variable = 'OutputSentiment'

# Correlate each feature with the target only. Building the full
# feature-by-feature matrix with .corr() is quadratic in the number of columns
# and only one column of it is ever needed.
# The target itself is excluded: its self-correlation of 1.0 would always take
# the first slot and leave one feature fewer than requested.
feature_columns = df_statistical.drop(columns=[target_variable])
correlation_with_target = feature_columns.corrwith(df_statistical[target_variable]).abs()

# Top 2000 features by absolute correlation with the target.
top_2000_features = correlation_with_target.nlargest(2000).index
corr_2k = df_statistical[top_2000_features]

# Top 5000 features by absolute correlation with the target.
top_5000_features = correlation_with_target.nlargest(5000).index
corr_5k = df_statistical[top_5000_features]

# Preview the 5000-feature frame.
corr_5k.head()

In [None]:
# Concatenate the DataFrames along the columns axis
corr_2k = pd.concat([corr_2k, tfidf_df_13k_connotations.iloc[:, -4:]], axis=1)
corr_5k = pd.concat([corr_5k ,tfidf_df_13k_connotations.iloc[:, -4:]], axis=1)

if 'OutputSentiment' in corr_2k.columns:
    corr_2k = corr_2k.drop('OutputSentiment', axis=1)
if 'OutputSentiment' in corr_5k.columns:
    corr_5k = corr_5k.drop('OutputSentiment', axis=1)

## CHI SQUARE CLASSIFICATION

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

# 5-fold cross-validated accuracy on the chi-square-selected features.
# Multinomial Naive Bayes is scored on the 8000-feature frame (chisq_8k),
# k-Nearest Neighbors on the 5000-feature frame (chisq_5k).

# --- Multinomial Naive Bayes ---
nb_classifier = MultinomialNB()
nb_scores = cross_val_score(estimator=nb_classifier, X=chisq_8k, y=y, cv=5)

print("Multinomial Naive Bayes Cross-Validation Scores:", nb_scores, sep="\n")
print("Mean Accuracy:", nb_scores.mean())

# --- k-Nearest Neighbors (default hyper-parameters) ---
knn_classifier = KNeighborsClassifier()
knn_scores = cross_val_score(estimator=knn_classifier, X=chisq_5k, y=y, cv=5)

print("\nk-Nearest Neighbors Cross-Validation Scores:", knn_scores, sep="\n")
print("Mean Accuracy:", knn_scores.mean())

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# 5-fold cross-validated accuracy on the chi-square-selected features.
# The linear SVM is scored on chisq_5k, logistic regression on chisq_8k.

# One shuffled 5-fold splitter with a fixed seed, shared by both models.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Models under comparison.
svm_model = SVC(kernel='linear')
logistic_model = LogisticRegression()

# Per-fold accuracies.
svm_scores = cross_val_score(estimator=svm_model, X=chisq_5k, y=y, cv=kfold)
logistic_scores = cross_val_score(estimator=logistic_model, X=chisq_8k, y=y, cv=kfold)

print("SVM Cross-validation scores:", svm_scores)
print("Logistic Regression Cross-validation scores:", logistic_scores)

# Mean and (population) standard deviation of the fold accuracies.
accuracy_summary = [
    ("SVM Mean Accuracy:", svm_scores.mean()),
    ("SVM Standard Deviation of Accuracy:", svm_scores.std()),
    ("Logistic Regression Mean Accuracy:", logistic_scores.mean()),
    ("Logistic Regression Standard Deviation of Accuracy:", logistic_scores.std()),
]
for caption, value in accuracy_summary:
    print(caption, value)

## CORRELATION CLASSIFICATION

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

# 5-fold cross-validated accuracy on the correlation-selected features.
# Both classifiers are scored on the 2000-feature frame (corr_2k).

# --- Multinomial Naive Bayes ---
nb_classifier = MultinomialNB()
nb_scores = cross_val_score(estimator=nb_classifier, X=corr_2k, y=y, cv=5)

print("Multinomial Naive Bayes Cross-Validation Scores:", nb_scores, sep="\n")
print("Mean Accuracy:", nb_scores.mean())

# --- k-Nearest Neighbors (default hyper-parameters) ---
knn_classifier = KNeighborsClassifier()
knn_scores = cross_val_score(estimator=knn_classifier, X=corr_2k, y=y, cv=5)

print("\nk-Nearest Neighbors Cross-Validation Scores:", knn_scores, sep="\n")
print("Mean Accuracy:", knn_scores.mean())

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# 5-fold cross-validated accuracy on the correlation-selected features.
# Both models are scored on the 5000-feature frame (corr_5k).

# One shuffled 5-fold splitter with a fixed seed, shared by both models.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Models under comparison.
svm_model = SVC(kernel='linear')
logistic_model = LogisticRegression()

# Per-fold accuracies.
svm_scores = cross_val_score(estimator=svm_model, X=corr_5k, y=y, cv=kfold)
logistic_scores = cross_val_score(estimator=logistic_model, X=corr_5k, y=y, cv=kfold)

print("SVM Cross-validation scores:", svm_scores)
print("Logistic Regression Cross-validation scores:", logistic_scores)

# Mean and (population) standard deviation of the fold accuracies.
accuracy_summary = [
    ("SVM Mean Accuracy:", svm_scores.mean()),
    ("SVM Standard Deviation of Accuracy:", svm_scores.std()),
    ("Logistic Regression Mean Accuracy:", logistic_scores.mean()),
    ("Logistic Regression Standard Deviation of Accuracy:", logistic_scores.std()),
]
for caption, value in accuracy_summary:
    print(caption, value)