In [None]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.pipeline import make_pipeline

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

from bs4 import BeautifulSoup
from langdetect import detect
from urllib.parse import urlsplit

from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')

## Pre Processing

In [None]:
# Load the raw IMDB reviews and give both columns the names used downstream.
df = pd.read_csv("../Dataset/IMDB Dataset.csv").rename(
    columns={'review': 'OriginalReviews', 'sentiment': 'OutputSentiment'}
)

# Work on a reproducible 5,000-row random sample with a fresh 0..n-1 index.
df_subset = df.sample(n=5000, random_state=42).reset_index(drop=True)

# Only the last expression of a cell is rendered: show the class balance.
df_subset['OutputSentiment'].value_counts()

In [None]:
# Display the sampled reviews for a visual check before any cleaning is applied.
df_subset

In [None]:
# Lowercase every review. The vectorised .str accessor avoids one Python lambda
# call per row and leaves missing values as NaN instead of raising AttributeError.
df_subset["OriginalReviews"] = df_subset["OriginalReviews"].str.lower()

def remove_punctuation_from_text(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Characters are removed, not replaced by a space, so ``"it's"`` becomes
    ``"its"`` and words joined only by punctuation are fused together.
    """
    return text.translate(str.maketrans("", "", string.punctuation))

# Strip ASCII punctuation from every review.
df_subset['OriginalReviews'] = df_subset['OriginalReviews'].apply(remove_punctuation_from_text)

# Remove digit runs from the 'OriginalReviews' column.
# regex=True must be explicit: pandas >= 2.0 defaults to regex=False, which would
# search for the literal text "\d+" and remove nothing. The raw string keeps the
# backslash from being treated as a (deprecated) string escape.
df_subset['OriginalReviews'] = df_subset['OriginalReviews'].str.replace(r'\d+', '', regex=True)

In [None]:
# Sanity check of the punctuation-stripping approach on one sample string.
text = "eastwood's! example, text"
punctuation_to_remove = string.punctuation
# Mapping each punctuation character to None tells str.translate to delete it.
translator = str.maketrans(dict.fromkeys(punctuation_to_remove))

text = text.translate(translator)
print(text)


In [None]:
# Function to remove stopwords from a text
def remove_stopwords(text):
    """Tokenize ``text`` with ``word_tokenize`` and drop stopword tokens.

    A token is dropped when its lowercase form is in the module-level
    ``stop_words`` set. The surviving tokens are returned joined by single spaces.
    """
    kept = []
    for token in word_tokenize(text):
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

# Drop stopwords from every review in the 'OriginalReviews' column
df_subset['OriginalReviews'] = df_subset['OriginalReviews'].map(remove_stopwords)

In [None]:
# Matches http:// and https:// links as well as bare www. hosts, up to the next
# whitespace. Compiled once here instead of on every call.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')


def remove_urls(text):
    """Return ``text`` with every URL-like substring deleted.

    Only the URL itself is removed; the whitespace around it is left untouched,
    so a removed URL can leave two adjacent spaces behind.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        ``text`` without the matched URLs.
    """
    # The original also ran re.findall and discarded the result, scanning the
    # text twice for nothing.
    return _URL_PATTERN.sub('', text)

# Strip URLs from every review.
# NOTE(review): punctuation has already been deleted from this column earlier in
# the notebook, so "://" and "." no longer occur and the URL pattern can no longer
# match; consider running this step before punctuation removal.
df_subset['OriginalReviews'] = df_subset['OriginalReviews'].map(remove_urls)

In [None]:
def remove_html_tags(text):
    """Parse ``text`` with BeautifulSoup's 'html.parser' and return its text content."""
    return BeautifulSoup(text, 'html.parser').get_text()

# Reduce every review to its text content.
# NOTE(review): "<" and ">" were already deleted by the punctuation step earlier
# in the notebook, so no intact tags remain at this point (e.g. "<br />" has
# become "br"); consider running this step before punctuation removal.
df_subset["OriginalReviews"] = df_subset["OriginalReviews"].map(remove_html_tags)

In [None]:
def clean_text(text):
    """Delete every character that is not an ASCII letter, a digit or whitespace."""
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

# Drop any remaining non-alphanumeric characters from every review
df_subset['OriginalReviews'] = df_subset['OriginalReviews'].map(clean_text)

In [None]:
def remove_extra_whitespaces(text):
    """Collapse each run of whitespace to one space and trim both ends."""
    # str.split() with no argument splits on whitespace runs and discards the
    # leading/trailing ones, so re-joining with a single space normalises the text.
    return ' '.join(text.split())

# Normalise whitespace in every review
df_subset['OriginalReviews'] = df_subset['OriginalReviews'].map(remove_extra_whitespaces)

In [None]:
def filter_non_english(text):
    """Return True when ``detect`` classifies ``text`` as English ('en').

    Despite the name, the result is a keep-mask value: True means "English,
    keep this row". Any error raised during detection is treated as
    "not English" and yields False.

    NOTE(review): langdetect's documentation describes its results as
    non-deterministic unless ``DetectorFactory.seed`` is set; confirm whether
    a seed is needed for reproducible filtering.
    """
    try:
        language = detect(text)
    except Exception:
        # Best-effort: a failed detection means "drop the row". Catching Exception
        # rather than using a bare except lets KeyboardInterrupt and SystemExit
        # propagate, so the cell can still be interrupted.
        return False
    return language == 'en'

# Boolean mask that is True for the reviews detected as English
mask = df_subset['OriginalReviews'].apply(filter_non_english)

# Keep only the English reviews. .copy() makes the result an independent frame,
# so the column assignments further down do not write into a slice of the old
# one (SettingWithCopyWarning). The index is renumbered so it stays contiguous
# after rows are dropped.
df_subset = df_subset[mask].copy().reset_index(drop=True)

In [None]:
# Initialize the WordNet lemmatizer once; lemmatize_text below reuses this instance
lemmatizer = WordNetLemmatizer()

# Function to get the part of speech for WordNet lemmatizer
def get_wordnet_pos(tag):
    """Map a POS tag to a WordNet part-of-speech constant by its first letter.

    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag,
    including an empty one, falls back to noun.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return first_letter_to_pos.get(tag[:1], wordnet.NOUN)

# Function to lemmatize a text
def lemmatize_text(text):
    """Tokenize, POS-tag and lemmatize ``text``; return the lemmas space-joined.

    Each token is lemmatized with the WordNet part of speech that
    ``get_wordnet_pos`` derives from its ``nltk.pos_tag`` tag.
    """
    lemmas = []
    for word, pos in nltk.pos_tag(word_tokenize(text)):
        lemmas.append(lemmatizer.lemmatize(word, get_wordnet_pos(pos)))
    return ' '.join(lemmas)

# Lemmatize every review in the 'OriginalReviews' column
df_subset['OriginalReviews'] = df_subset['OriginalReviews'].map(lemmatize_text)

In [None]:
# Display the fully preprocessed reviews for a visual check before saving.
df_subset

In [None]:
# Persist the cleaned reviews; index=False keeps the row index out of the file.
df_subset.to_csv("../csv/Preprocessed_data.csv",index=False)

## Feature Extraction Using TF-IDF

In [None]:
# Reload the cleaned reviews written by the preprocessing section above.
preprocessed = pd.read_csv('../csv/Preprocessed_data.csv')

In [None]:
num_features_to_keep = 8000

# TF-IDF vectorisation followed by univariate selection (ANOVA F-test) of the
# num_features_to_keep highest-scoring terms.
pipeline = make_pipeline(TfidfVectorizer(), SelectKBest(f_classif, k=num_features_to_keep))

# NOTE(review): the selector is fitted on every row, labels included, so any
# cross-validation run later on these features has already seen its test folds.
# Consider fitting the selection inside each training fold instead.
X_transformed = pipeline.fit_transform(preprocessed['OriginalReviews'], preprocessed['OutputSentiment'])

# Names of the vocabulary terms that survived selection
vocabulary = pipeline.named_steps['tfidfvectorizer'].get_feature_names_out()
selected_feature_names = vocabulary[pipeline.named_steps['selectkbest'].get_support()]

# Dense frame with one column per selected term
selected_features_df = pd.DataFrame(X_transformed.toarray(), columns=selected_feature_names)

# Original columns (text + label) followed by the selected TF-IDF columns
tfidf_df_13k = pd.concat([preprocessed, selected_features_df], axis=1)

# index=False keeps the row index out of the file; otherwise it comes back as an
# extra 'Unnamed: 0' column when the CSV is re-read and would be mistaken for a feature.
tfidf_df_13k.to_csv("../csv/tfidf_df_13k.csv", index=False)

# Last expression of the cell, so the preview is actually rendered.
tfidf_df_13k.head()

## CONNOTATIONS

In [None]:
# Download the VADER lexicon (run this once)
nltk.download('vader_lexicon')
delimiter = '\t'

# Read the lexicon files. Forward slashes work on Windows, Linux and macOS alike
# and match the '../csv/...' paths used elsewhere in this notebook; the previous
# backslash paths only resolved on Windows.
positive = pd.read_csv('../Connotations/positive-words.txt', sep=delimiter, names=['words'])
negative = pd.read_csv('../Connotations/negative-words.txt', sep=delimiter, names=['words'])
connotations = pd.read_csv("../Connotations/connotations.csv")

# word -> emotion label lookup built from the 'word' and 'emotion' columns
word_emotion_map = dict(zip(connotations['word'], connotations['emotion']))

def update_counts(review):
    """Count the words of ``review`` labelled 'positive' / 'negative' in ``word_emotion_map``.

    The review is split on whitespace; words missing from the map count for
    neither side. Returns ``(positive_count, negative_count)``.
    """
    emotions = [word_emotion_map.get(word) for word in review.split()]
    return emotions.count('positive'), emotions.count('negative')

# One (positive, negative) pair per review, unpacked into two new columns
tfidf_df_13k[['Positive_Connotation_Count', 'Negative_Connotation_Count']] = tfidf_df_13k['OriginalReviews'].map(update_counts).tolist()

In [None]:
# Load positive and negative words from files. Forward slashes keep the paths
# portable across operating systems (the previous backslash form only resolved
# on Windows) and consistent with the '../csv/...' paths used elsewhere.
positive_words_df = pd.read_csv('../Connotations/positive-words.txt', header=None, names=['words'])
negative_words_df = pd.read_csv('../Connotations/negative-words.txt', header=None, names=['words'])

# Convert DataFrame columns to sets for constant-time membership tests
positive_words = set(positive_words_df['words'].tolist())
negative_words = set(negative_words_df['words'].tolist())

# Count, per review, how many words appear in each opinion word list
def update_word_counts(review):
    """Return ``(positive_count, negative_count)`` for ``review``.

    The review is split on whitespace and each word is checked against the
    module-level ``positive_words`` and ``negative_words`` sets; every
    occurrence is counted.
    """
    words = review.split()
    positive_count = len([word for word in words if word in positive_words])
    negative_count = len([word for word in words if word in negative_words])
    return positive_count, negative_count

# One (positive, negative) pair per review, unpacked into two new columns
tfidf_df_13k[['Positive_Word_Count', 'Negative_Word_Count']] = tfidf_df_13k['OriginalReviews'].map(update_word_counts).tolist()

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
# Use VADER for sentiment analysis; one analyzer instance is shared by vader_sentiment below
sid = SentimentIntensityAnalyzer()

def vader_sentiment(review):
    """Return the 'pos' and 'neg' polarity scores of ``review``, each multiplied by 100.

    Scores come from the module-level ``sid`` analyzer; the result is the pair
    ``(positive, negative)``.
    """
    polarity = sid.polarity_scores(review)
    positive = polarity['pos'] * 100
    negative = polarity['neg'] * 100
    return positive, negative

# Apply the function to the 'OriginalReviews' column and unpack the result into two new columns.
# NOTE(review): the column now holds lowercased, punctuation-free, stopword-filtered,
# lemmatized text; VADER presumably scores raw text better (it may rely on
# punctuation, capitalisation and negation words) - confirm whether the raw
# reviews should be scored instead.
tfidf_df_13k[['Positive_VADER_Count', 'Negative_VADER_Count']] = tfidf_df_13k['OriginalReviews'].apply(vader_sentiment).tolist()

# Written with the default index, which comes back as an 'Unnamed: 0' column when
# the file is re-read (the notebook drops that column after loading).
tfidf_df_13k.to_csv("../csv/tfidf_df_13k_connotations_vader.csv")

In [None]:
# text = tfidf_df_13k.iloc[4993]['OriginalReviews']

In [None]:
#tfidf_df_13k = pd.read_csv("../csv/tfidf_df_13k.csv")

In [None]:
# Reload the TF-IDF + connotation + VADER feature table saved above.
tfidf_df_13k_connotations = pd.read_csv('../csv/tfidf_df_13k_connotations_vader.csv')

In [None]:
# Discard the saved row index that read_csv brought back as a column
tfidf_df_13k_connotations = tfidf_df_13k_connotations.drop(columns=['Unnamed: 0'])

In [None]:
# NOTE(review): this rebinds the name to the in-memory frame, discarding the copy
# loaded from CSV in the cells above. It is an alias, not a copy, so both names
# refer to the same object. Keep either the CSV path or this line, not both.
tfidf_df_13k_connotations = tfidf_df_13k

In [None]:
# Display the combined feature table for a visual check.
tfidf_df_13k_connotations

In [None]:
# Keep only the label and the TF-IDF term columns: drop the raw text and the six
# hand-crafted sentiment-count features.
df_statistical = tfidf_df_13k_connotations.drop(
    columns=[
        'OriginalReviews',
        'Positive_Connotation_Count',
        'Negative_Connotation_Count',
        'Positive_Word_Count',
        'Negative_Word_Count',
        'Positive_VADER_Count',
        'Negative_VADER_Count',
    ]
)
df_statistical.head()

In [None]:
# Replace the sentiment strings with integer codes.
# NOTE(review): presumably 'negative' -> 0 and 'positive' -> 1; confirm with label.classes_.
label = LabelEncoder()
df_statistical['OutputSentiment'] = label.fit_transform(df_statistical['OutputSentiment'])

## CHI SQUARE

In [None]:
# # This will get the top 5000 relavant features out of the sample
# chi2_selector = SelectKBest(chi2, k=5000)

# # This will transform the dataset i.e, it will reduce the dimensions by just considering the relavant features only
# X = df_statistical.drop(columns=['OutputSentiment'])
# y = df_statistical['OutputSentiment']
# X_5000 = chi2_selector.fit_transform(X, y)

# # Get the indices of the selected features
# selected_feature_indices = chi2_selector.get_support(indices=True)

# # Get the names of the selected features
# selected_feature_names = X.columns[selected_feature_indices]

# chisq_5k = X[selected_feature_names]
# chisq_5k.head()

# chisq_5k = pd.concat([chisq_5k,tfidf_df_13k_connotations.iloc[:, -6:]],axis=1)
# chisq_5k.head()

In [None]:
# # This will get the top 8000 relavant features out of the sample
# chi2_selector = SelectKBest(chi2, k=8000)

# # This will transform the dataset i.e, it will reduce the dimensions by just considering the relavant features only
# X = df_statistical.drop(columns=['OutputSentiment'])
# y = df_statistical['OutputSentiment']
# X_8000 = chi2_selector.fit_transform(X, y)

# # Get the indices of the selected features
# selected_feature_indices = chi2_selector.get_support(indices=True)

# # Get the names of the selected features
# selected_feature_names = X.columns[selected_feature_indices]

# chisq_8k = X[selected_feature_names]
# chisq_8k.head()

# chisq_8k = pd.concat([chisq_8k,tfidf_df_13k_connotations.iloc[:, -6:]],axis=1)
# chisq_8k.head()

In [None]:
# from sklearn.feature_selection import SelectKBest, f_regression

# # For 5000 relevant features
# cor_selector_5k = SelectKBest(f_regression, k=5000)

# # Transform the dataset to reduce dimensions by considering only the relevant features
# X = df_statistical.drop(columns=['OutputSentiment'])
# y = df_statistical['OutputSentiment']
# X_5000 = cor_selector_5k.fit_transform(X, y)

# # Get the indices of the selected features
# selected_feature_indices_5k = cor_selector_5k.get_support(indices=True)

# # Get the names of the selected features
# selected_feature_names_5k = X.columns[selected_feature_indices_5k]

# cor_5k = X[selected_feature_names_5k]
# cor_5k.head()

# cor_5k = pd.concat([cor_5k, tfidf_df_13k_connotations.iloc[:, -6:]], axis=1)
# cor_5k.head()

# # For 8000 relevant features
# cor_selector_8k = SelectKBest(f_regression, k=8000)

# # Transform the dataset to reduce dimensions by considering only the relevant features
# X = df_statistical.drop(columns=['OutputSentiment'])
# y = df_statistical['OutputSentiment']
# X_8000 = cor_selector_8k.fit_transform(X, y)

# # Get the indices of the selected features
# selected_feature_indices_8k = cor_selector_8k.get_support(indices=True)

# # Get the names of the selected features
# selected_feature_names_8k = X.columns[selected_feature_indices_8k]

# cor_8k = X[selected_feature_names_8k]
# cor_8k.head()

# cor_8k = pd.concat([cor_8k, tfidf_df_13k_connotations.iloc[:, -6:]], axis=1)
# cor_8k.head()

In [None]:
# cor_8k

In [None]:
# from sklearn.preprocessing import MinMaxScaler

# # Assuming chisq_8k is your DataFrame
# columns_to_normalize = ['Positive_Connotation_Count', 'Negative_Connotation_Count', 
#                          'Positive_Word_Count', 'Negative_Word_Count', 
#                          'Positive_VADER_Count', 'Negative_VADER_Count']

# scaler = MinMaxScaler()
# cor_8k[columns_to_normalize] = scaler.fit_transform(cor_8k[columns_to_normalize])

In [None]:
# cor_8k

## SVM-RFE (the code in this section performs PCA-based feature selection)

In [None]:
import numpy as np
import dask.array as da
from dask.distributed import Client, LocalCluster
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Define PCA-based feature selection
def pca_feature_selection_chunk(X_chunk, n_selected_features):
    """Rank the columns of ``X_chunk`` by their PCA loadings and return the top ones.

    The data is standardised, a PCA with ``n_selected_features`` components is
    fitted, and each original column is scored by the sum over components of
    ``|loading| * explained_variance_ratio``. Columns that weigh heavily on
    the components explaining the most variance score highest.

    Parameters
    ----------
    X_chunk : array-like of shape (n_samples, n_features)
        Feature matrix.
    n_selected_features : int
        Number of column indices to return; also the number of PCA components,
        so it must not exceed ``min(n_samples, n_features)``.

    Returns
    -------
    numpy.ndarray
        1-D array of ``n_selected_features`` column indices into ``X_chunk``,
        ordered from lowest to highest score.
    """
    # Normalize the data
    X_normalized = StandardScaler().fit_transform(X_chunk)

    # Fit PCA; a fixed random_state keeps the randomized solver reproducible
    pca = PCA(n_components=n_selected_features, random_state=42)
    pca.fit(X_normalized)

    # The previous version argsorted explained_variance_ratio_, which has one
    # entry per *component*, so it always returned 0..n-1 (the first n columns)
    # whatever the data. components_ is (n_components, n_features): weight each
    # component's absolute loadings by its variance share and sum per feature.
    component_weights = pca.explained_variance_ratio_[:, np.newaxis]
    feature_scores = (np.abs(pca.components_) * component_weights).sum(axis=0)

    selected_features = np.argsort(feature_scores)[-n_selected_features:]
    return np.ravel(selected_features)


# Perform PCA-based feature selection on the full matrix
def dask_pca_feature_selection(X, n_selected_features):
    """Select feature-column indices of ``X`` with ``pca_feature_selection_chunk``.

    The selection runs once on the whole matrix. The previous block-wise Dask
    mapping could not give a meaningful result: an integer ``chunks`` value
    split the columns as well as the rows, each block returned indices local
    to itself, and the 1-D block results did not match the 2-D layout
    ``map_blocks`` expects by default. PCA needs all rows and all columns
    together, so there is nothing to gain from chunking here. The name is kept
    so existing calls continue to work.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix; converted with ``np.asarray``.
    n_selected_features : int
        Requested number of features, capped at ``min(n_samples, n_features)``.

    Returns
    -------
    numpy.ndarray
        1-D array of selected column indices into ``X``.
    """
    X = np.asarray(X)
    n_components = min(X.shape[0], X.shape[1], n_selected_features)

    print("Dimensions of the dataset:")
    print("Number of samples:", X.shape[0])
    print("Number of features:", X.shape[1])
    print("Computed value of n_components:", n_components)

    return pca_feature_selection_chunk(X, n_components)




# Create a Dask cluster and client
cluster = LocalCluster()  # Use a local cluster for parallel processing
client = Client(cluster)

try:
    # Feature matrix: every column except the label
    X = df_statistical.drop(columns=['OutputSentiment']).values
    # Target vector aligned row-for-row with X. The classification cells below
    # read `y`, and no other live cell defines it.
    y = df_statistical['OutputSentiment'].values

    # Worker count of the local cluster
    num_processes = len(cluster.workers)

    # Indices of the columns of X chosen by the PCA-based selection
    selected_features = dask_pca_feature_selection(X, n_selected_features=120)

    # Create a DataFrame of selected features
    selected_features_df = pd.DataFrame(
        data=X[:, selected_features],
        columns=[f"Feature_{i+1}" for i in range(len(selected_features))],
    )
finally:
    # Always release the workers, even when the selection step fails
    client.close()
    cluster.close()

In [None]:
# `SVM_rfe` is never assigned anywhere in the notebook, so evaluating it raised
# NameError. Preview the frame of selected features built in the cell above instead.
selected_features_df.head()

## CLASSIFICATION

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

# # Multinomial Naive Bayes Classifier
# nb_classifier = MultinomialNB()
# nb_scores = cross_val_score(nb_classifier, SVM_rfe, y, cv=5)

# print("Multinomial Naive Bayes Cross-Validation Scores:")
# print(nb_scores)
# print("Mean Accuracy:", np.mean(nb_scores))

# Target vector, aligned row-for-row with X (no earlier live cell defines `y`)
y = df_statistical['OutputSentiment'].to_numpy()

# k-Nearest Neighbors Classifier, evaluated on the selected feature columns only.
# cross_val_score takes (estimator, X, y, ...); the column subset has to be
# applied to X itself. Passing selected_features as an extra positional argument
# is rejected by current scikit-learn.
knn_classifier = KNeighborsClassifier()
knn_scores = cross_val_score(knn_classifier, X[:, selected_features], y, cv=5)

print("\nk-Nearest Neighbors Cross-Validation Scores:")
print(knn_scores)
print("Mean Accuracy:", np.mean(knn_scores))

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Features: the selected columns of X; target: y (both built in earlier cells)
X_selected = X[:, selected_features]

# Initialize models
svm_model = SVC(kernel='linear')  # Linear SVM
logistic_model = LogisticRegression()

# Initialize KFold cross-validation: 3 shuffled folds with a fixed seed
kfold = KFold(n_splits=3, shuffle=True, random_state=42)

# Perform 3-fold cross-validation for SVM.
# cross_val_score takes (estimator, X, y, ...); the column subset must be applied
# to X rather than passed as an extra positional argument.
svm_scores = cross_val_score(svm_model, X_selected, y, cv=kfold)

# Perform 3-fold cross-validation for Logistic Regression
logistic_scores = cross_val_score(logistic_model, X_selected, y, cv=kfold)

# Display the cross-validation scores
print("SVM Cross-validation scores:", svm_scores)
print("Logistic Regression Cross-validation scores:", logistic_scores)

# Mean and standard deviation of the per-fold accuracies
print("SVM Mean Accuracy:", np.mean(svm_scores))
print("SVM Standard Deviation of Accuracy:", np.std(svm_scores))
print("Logistic Regression Mean Accuracy:", np.mean(logistic_scores))
print("Logistic Regression Standard Deviation of Accuracy:", np.std(logistic_scores))

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import StratifiedKFold

# Baseline network on the selected feature columns of X, with y as the target

# Encode categorical labels if needed
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets.
# train_test_split splits every positional array it receives, so the column
# subset must be applied to X first. Passing selected_features as a third array
# fails on inconsistent lengths and would return six pieces, not four.
X_train, X_test, y_train, y_test = train_test_split(
    X[:, selected_features], y_encoded, test_size=0.2, random_state=42
)

# Standardize the features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build a simple neural network model
model = Sequential()
model.add(Dense(128, input_dim=X_train_scaled.shape[1], activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, verbose=1)

# Evaluate the model on the test set; ravel() turns the (n, 1) prediction column
# into a 1-D vector matching y_test
y_pred = (model.predict(X_test_scaled) > 0.5).astype("int32").ravel()
accuracy = accuracy_score(y_test, y_pred)

print("Neural Network Accuracy on Test Set:", accuracy)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf

# Second network: same architecture, trained with Adagrad and early stopping,
# on the selected feature columns of X with y as the target

# Encode categorical labels if needed
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets.
# train_test_split splits every positional array it receives, so the column
# subset must be applied to X first. Passing selected_features as a third array
# fails on inconsistent lengths and would return six pieces, not four.
X_train, X_test, y_train, y_test = train_test_split(
    X[:, selected_features], y_encoded, test_size=0.2, random_state=42
)

# Standardize the features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Custom optimizer.
# NOTE(review): the `experimental` namespace and some of these keyword arguments
# depend on the installed TensorFlow/Keras version - confirm against the version
# in use.
optimizer = tf.keras.optimizers.experimental.Adagrad(
    learning_rate=0.1,
    initial_accumulator_value=0.1,
    epsilon=1e-07,
    weight_decay=0.001,
    clipnorm=None,
    clipvalue=None,
    global_clipnorm=None,
    use_ema=False,
    ema_momentum=0.99,
    ema_overwrite_frequency=None,
    jit_compile=True,
    name='Adagrad',
)

# Build a fresh network. Without this, compile() below would act on whatever
# `model` is left over from a previous cell: training would continue from
# already-fitted weights (so the optimizers are not compared fairly), or the
# cell would fail when no such model exists.
model = Sequential()
model.add(Dense(128, input_dim=X_train_scaled.shape[1], activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Define early stopping: stop after 5 epochs without val_loss improvement and
# restore the best weights seen
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model with early stopping
history = model.fit(
    X_train_scaled, y_train,
    epochs=50, batch_size=32,
    validation_split=0.15,  # Using a portion of training set for validation
    callbacks=[early_stopping],
    verbose=1
)

# Evaluate the model on the test set; ravel() turns the (n, 1) prediction column
# into a 1-D vector matching y_test
y_pred = (model.predict(X_test_scaled) > 0.5).astype("int32").ravel()
accuracy = accuracy_score(y_test, y_pred)

print("Neural Network Accuracy on Test Set:", accuracy)
