## **Dependencies to be downloaded**

In [None]:
%pip install -r ../requirements.txt
# %pip install kaggle
# !kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews -p ../Dataset/

## **Required Libraries**

In [27]:
import pandas as pd
import numpy as np
import re
import string
import zipfile

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.feature_selection import SelectKBest, chi2
from urllib.parse import urlsplit
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

from bs4 import BeautifulSoup
from langdetect import detect
from urllib.parse import urlsplit

from nltk.corpus import wordnet
# nltk.download('averaged_perceptron_tagger')

# **Set Paths**

In [28]:
# Paths for Local Run
IMDB_PREPROCESSED = r"../Datasets/IMDB_PREPROCESSED.csv"
TFIDF_FEATURES = r"../Datasets/TFIDF_FEATURES.csv"
LEXICON_POSITIVE = r"../Datasets/positive-words.txt"
LEXICON_NEGATIVE = r"../Datasets/negative-words.txt"
LEXICON_CONNOTATION = r"../Datasets/connotations.csv"
POSNEG_CONN_FEATURES = r"../Datasets/POSNEG_CONN_FEATURES.csv"
POSNEG_FEATURES = r"../Datasets/POSNEG_FEATURES.csv"
VADER_SCORES_FEATURES = r"../Datasets/VADER_SCORES_FEATURES.csv"
TARGET_VALUES = r"../Datasets/TARGET_VALUES.csv"

INFOGAIN_TFIDF_1K = r"../Datasets/INFOGAIN_TFIDF_1K.csv"
INFOGAIN_TFIDF_2K = r"../Datasets/INFOGAIN_TFIDF_2K.csv"
INFOGAIN_TFIDF_3K = r"../Datasets/INFOGAIN_TFIDF_3K.csv"
INFOGAIN_TFIDF_4K = r"../Datasets/INFOGAIN_TFIDF_4K.csv"
INFOGAIN_TFIDF_5K = r"../Datasets/INFOGAIN_TFIDF_5K.csv"
INFOGAIN_TFIDF_6K = r"../Datasets/INFOGAIN_TFIDF_6K.csv"
INFOGAIN_TFIDF_7K = r"../Datasets/INFOGAIN_TFIDF_7K.csv"
INFOGAIN_TFIDF_8K = r"../Datasets/INFOGAIN_TFIDF_8K.csv"

CHI_TFIDF_1K = r"../Datasets/CHI_TFIDF_1K.csv"
CHI_TFIDF_2K = r"../Datasets/CHI_TFIDF_2K.csv"
CHI_TFIDF_3K = r"../Datasets/CHI_TFIDF_3K.csv"
CHI_TFIDF_4K = r"../Datasets/CHI_TFIDF_4K.csv"
CHI_TFIDF_5K = r"../Datasets/CHI_TFIDF_5K.csv"
CHI_TFIDF_6K = r"../Datasets/CHI_TFIDF_6K.csv"
CHI_TFIDF_7K = r"../Datasets/CHI_TFIDF_7K.csv"
CHI_TFIDF_8K = r"../Datasets/CHI_TFIDF_8K.csv"

CORR_TFIDF_1K = r"../Datasets/CORR_TFIDF_1K.csv"
CORR_TFIDF_2K = r"../Datasets/CORR_TFIDF_2K.csv"
CORR_TFIDF_3K = r"../Datasets/CORR_TFIDF_3K.csv"
CORR_TFIDF_4K = r"../Datasets/CORR_TFIDF_4K.csv"
CORR_TFIDF_5K = r"../Datasets/CORR_TFIDF_5K.csv"
CORR_TFIDF_6K = r"../Datasets/CORR_TFIDF_6K.csv"
CORR_TFIDF_7K = r"../Datasets/CORR_TFIDF_7K.csv"
CORR_TFIDF_8K = r"../Datasets/CORR_TFIDF_8K.csv"

LDA_150 = r"../Datasets/LDA_150.csv"
LDA_200 = r"../Datasets/LDA_200.csv"
LDA_250 = r"../Datasets/LDA_250.csv"

# **Dataset Import**

In [None]:
# Specify the path to the ZIP file
zip_file_path = r"../Datasets/imdb-dataset-of-50k-movie-reviews.zip"

# Open the ZIP file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    # Find the first file with a .csv extension (assuming it's the one you want)
    csv_file = [name for name in zip_ref.namelist() if name.endswith('.csv')][0]
    
    # Read the CSV file directly from the ZIP archive into a DataFrame
    df = pd.read_csv(zip_ref.open(csv_file))

df.head()

In [None]:
df_subset = df.sample(n=5000,random_state=42).reset_index(drop=True)
df_subset.head()

In [None]:
df_subset['sentiment'].value_counts()

# **1. Preprocessing**

In [None]:
#lowercase
df_subset["review"]=df_subset["review"].apply(lambda x:x.lower())

#Remove punctuations
df_subset['review'] = df_subset['review'].str.replace('[{}]'.format(string.punctuation), ' ')

# Remove numbers from the 'reviews' column
df_subset['review'] = df_subset['review'].str.replace(r'\d+', '')

**Stopwords**

In [None]:
# Function to remove stopwords from a text
def remove_stopwords(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(filtered_tokens)

# Apply the remove_stopwords function to the 'review' column
df_subset['review'] = df_subset['review'].apply(remove_stopwords)

**URL**

In [None]:
def remove_urls(text):
    # Define a regular expression pattern to match URLs
    url_pattern = re.compile(r'https?://\S+|www\.\S+')

    # Find all matches in the text
    urls = re.findall(url_pattern, text)

    # Remove URLs from the text
    text_without_urls = re.sub(url_pattern, '', text)

    return text_without_urls

# Example usage
df_subset['review'] = df_subset['review'].apply(remove_urls)

**HTML**

In [None]:
def remove_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

df_subset["review"] = df_subset["review"].apply(remove_html_tags)

**Non-Alphanumeric**

In [None]:
def clean_text(text):
    # Remove non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

df_subset['review'] = df_subset['review'].apply(clean_text)

**Extra spaces**

In [None]:
def remove_extra_whitespaces(text):
    # Use regular expression to replace multiple whitespaces with a single space
    return re.sub(r'\s+', ' ', text).strip()

df_subset['review'] = df_subset['review'].apply(remove_extra_whitespaces)

**Filter non-English comments**

In [None]:
def filter_non_english(text):
    try:
        return detect(text) == 'en'
    except:
        return False

# Create a boolean mask for non-English reviews
mask = df_subset['review'].apply(filter_non_english)

# Create a new DataFrame containing only English reviews
df_subset = df_subset[mask]

**Lemma**

In [None]:
# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to get the part of speech for WordNet lemmatizer
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun if the part of speech is not found

# Function to lemmatize a text
def lemmatize_text(text):
    tokens = word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    lemmatized_tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in pos_tags]
    return ' '.join(lemmatized_tokens)

# Apply lemmatization to the 'text' column
df_subset['review'] = df_subset['review'].apply(lemmatize_text)

In [None]:
df_subset.head()

**LabelEncoding**

In [None]:
label = LabelEncoder()
df_subset['sentiment'] = label.fit_transform(df_subset['sentiment'])

In [None]:
df_subset.head()

In [None]:
# df_subset.to_csv('../Datasets/IMDB_Preprocessed.csv',index=False)

# **2. Feature Extraction**

a. TF-IDF

In [None]:
preprocessed = pd.read_csv(IMDB_PREPROCESSED)
preprocessed

In [None]:
# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer()
totalFeatures = vectorizer.fit_transform(preprocessed['review'])
column_names = vectorizer.get_feature_names_out()
totalFeatures = pd.DataFrame(totalFeatures.toarray(), columns=column_names)

totalFeatures.to_csv(TFIDF_FEATURES, index=False)

totalFeatures.head()

b. Connotations Count

In [None]:
connotations = pd.read_csv(LEXICON_CONNOTATION)

word_emotion_map = dict(zip(connotations['word'], connotations['emotion']))

def update_counts(review):
    positive_count = sum(1 for word in review.split() if word in word_emotion_map and word_emotion_map[word] == 'positive')
    negative_count = sum(1 for word in review.split() if word in word_emotion_map and word_emotion_map[word] == 'negative')
    return positive_count, negative_count

pos_neg_conn_counts_df = preprocessed['review'].apply(lambda x: pd.Series(update_counts(x), index=['Positive_Connotation_Count', 'Negative_Connotation_Count']))

pos_neg_conn_counts_df

In [None]:
# Store the Connotations Count Features
pos_neg_conn_counts_df.to_csv(POSNEG_CONN_FEATURES, index=False)

c. Positive and Negative Counts

In [None]:
# Load positive and negative words from files
positive_words_df = pd.read_csv(LEXICON_POSITIVE, header=None, names=['words'])
negative_words_df = pd.read_csv(LEXICON_NEGATIVE, header=None, names=['words'])

# Convert DataFrame columns to sets
positive_words = set(positive_words_df['words'].tolist())
negative_words = set(negative_words_df['words'].tolist())

# Define a function to update counts based on positive and negative words
def update_word_counts(review):
    positive_count = sum(1 for word in review.split() if word in positive_words)
    negative_count = sum(1 for word in review.split() if word in negative_words)
    return positive_count, negative_count

pos_neg_counts_df = preprocessed['review'].apply(lambda x: pd.Series(update_word_counts(x), index=['Positive_Word_Count', 'Negative_Word_Count']))
pos_neg_counts_df

In [None]:
pos_neg_counts_df.to_csv(POSNEG_FEATURES, index=False)

d. Vader Positive Score and Negative Score

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer

# Use VADER for sentiment analysis
sid = SentimentIntensityAnalyzer()

def vader_sentiment(review):
    scores = sid.polarity_scores(review)
    return scores['pos'] *100, scores['neg'] * 100

vader_scores_df = preprocessed['review'].apply(lambda x: pd.Series(vader_sentiment(x), index=['Positive_VADER_Count', 'Negative_VADER_Count']))
vader_scores_df

In [None]:
vader_scores_df.to_csv(VADER_SCORES_FEATURES, index=False)

e. Topic Modeling (Latent Dirichlet Allocation)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd

# Define the range of number of topics
num_topics_range = [150, 200, 250]  # Add more values if needed

# Define the file paths
file_paths = {
    150: LDA_150,
    200: LDA_200,
    250: LDA_250
}

# Loop through different numbers of topics
for num_topics in num_topics_range:
    
    lda_pipeline = make_pipeline(
        CountVectorizer(),  # CountVectorizer converts text to a matrix of token counts
        TfidfTransformer(),  # TF-IDF transformation
        LatentDirichletAllocation(n_components=num_topics, random_state=42)  # LDA for topic modeling
    )
    
    # Fit and transform data using the LDA pipeline
    X_lda = lda_pipeline.fit_transform(preprocessed['review'])
    
    # Create DataFrame for LDA features
    X_lda = pd.DataFrame(X_lda, columns=[f"Topic_{i}_{num_topics}" for i in range(num_topics)])

    # Choose the file path based on the number of topics
    file_path = file_paths[num_topics]

    # Save the DataFrame to a CSV file
    X_lda.to_csv(file_path, index=False)

    print(f"Results for {num_topics} topics saved to {file_path}")

# **3. Feature Selection**

In [None]:
set_features = pd.read_csv('../Datasets/TFIDF_Features.csv')
set_features

In [None]:
X = set_features  # Assuming 'target_column' is your target column
y_unencoded = preprocessed['sentiment']


label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_unencoded)
y = pd.DataFrame(y, columns=['sentiment'])

# Define the number of features you want to select
k_values = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000]
selected_dfs = {}

y

In [None]:
y.to_csv(TARGET_VALUES, index=False)

**1. CHI SQUARE**

In [None]:
print(f"X Shape{X.shape}")
print(f"y Shape{y.shape}")

In [None]:
for k in k_values:
    selector = SelectKBest(chi2, k=k).fit(X, y)
    selected_indices = selector.get_support(indices=True)
    selected_features = X.columns[selected_indices]
    selected_dfs[f'selected_features_{k}'] = set_features[selected_features]

# Access the selected dataframes
selected_df_chi_1000 = selected_dfs['selected_features_1000']
selected_df_chi_2000 = selected_dfs['selected_features_2000']
selected_df_chi_3000 = selected_dfs['selected_features_3000']
selected_df_chi_4000 = selected_dfs['selected_features_4000']
selected_df_chi_5000 = selected_dfs['selected_features_5000']
selected_df_chi_6000 = selected_dfs['selected_features_6000']
selected_df_chi_7000 = selected_dfs['selected_features_7000']
selected_df_chi_8000 = selected_dfs['selected_features_8000']

In [None]:
# Save each dataframe to its respective CSV file
selected_df_chi_1000.to_csv(CHI_TFIDF_1K, index=False)
print(f"File saved at location: {CHI_TFIDF_1K}")
selected_df_chi_2000.to_csv(CHI_TFIDF_2K, index=False)
print(f"File saved at location: {CHI_TFIDF_2K}")
selected_df_chi_3000.to_csv(CHI_TFIDF_3K, index=False)
print(f"File saved at location: {CHI_TFIDF_3K}")
selected_df_chi_4000.to_csv(CHI_TFIDF_4K, index=False)
print(f"File saved at location: {CHI_TFIDF_4K}")
selected_df_chi_5000.to_csv(CHI_TFIDF_5K, index=False)
print(f"File saved at location: {CHI_TFIDF_5K}")
selected_df_chi_6000.to_csv(CHI_TFIDF_6K, index=False)
print(f"File saved at location: {CHI_TFIDF_6K}")
selected_df_chi_7000.to_csv(CHI_TFIDF_7K, index=False)
print(f"File saved at location: {CHI_TFIDF_7K}")
selected_df_chi_8000.to_csv(CHI_TFIDF_8K, index=False)
print(f"File saved at location: {CHI_TFIDF_8K}")

**2. CORRELATION**

In [None]:
print(f"X Shape{X.shape}")
print(f"y Shape{y.shape}")

In [None]:
# Assuming X is your features DataFrame and y is your target DataFrame
# Make sure X and y have the same number of rows

# Calculate correlation between each feature in X and the target variable in y
correlation = X.corrwith(y['sentiment'])

# Select the top correlated features for each specified number of features
selected_dfs = {}

# Sort the correlation values and get the indices of the top correlated features
selected_features_1000 = correlation.abs().nlargest(1000).index
selected_features_2000 = correlation.abs().nlargest(2000).index
selected_features_3000 = correlation.abs().nlargest(3000).index
selected_features_4000 = correlation.abs().nlargest(4000).index
selected_features_5000 = correlation.abs().nlargest(5000).index
selected_features_6000 = correlation.abs().nlargest(6000).index
selected_features_7000 = correlation.abs().nlargest(7000).index
selected_features_8000 = correlation.abs().nlargest(8000).index

# Filter the features dataframe with selected features
selected_df_1000 = X[selected_features_1000]
selected_df_2000 = X[selected_features_2000]
selected_df_3000 = X[selected_features_3000]
selected_df_4000 = X[selected_features_4000]
selected_df_5000 = X[selected_features_5000]
selected_df_6000 = X[selected_features_6000]
selected_df_7000 = X[selected_features_7000]
selected_df_8000 = X[selected_features_8000]

In [None]:
# Optionally, you can save the selected features to CSV files
selected_df_1000.to_csv(CORR_TFIDF_1K, index=False)
print(f"File saved at location: {CORR_TFIDF_1K}")
selected_df_2000.to_csv(CORR_TFIDF_2K, index=False)
print(f"File saved at location: {CORR_TFIDF_2K}")
selected_df_3000.to_csv(CORR_TFIDF_3K, index=False)
print(f"File saved at location: {CORR_TFIDF_3K}")
selected_df_4000.to_csv(CORR_TFIDF_4K, index=False)
print(f"File saved at location: {CORR_TFIDF_4K}")
selected_df_5000.to_csv(CORR_TFIDF_5K, index=False)
print(f"File saved at location: {CORR_TFIDF_5K}")
selected_df_6000.to_csv(CORR_TFIDF_6K, index=False)
print(f"File saved at location: {CORR_TFIDF_6K}")
selected_df_7000.to_csv(CORR_TFIDF_7K, index=False)
print(f"File saved at location: {CORR_TFIDF_7K}")
selected_df_8000.to_csv(CORR_TFIDF_8K, index=False)
print(f"File saved at location: {CORR_TFIDF_8K}")

**3. INFO GAIN**

In [None]:
print(f"X Shape{X.shape}")
print(f"y Shape{y.shape}")

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Set a random seed for reproducibility
np.random.seed(42)  # You can use any integer value as the seed

# Initialize a dictionary to store selected DataFrames
selected_dfs = {}

# Iterate over k_values
for k in k_values:
    selector = SelectKBest(mutual_info_classif, k=k).fit(X, y['sentiment'])
    selected_indices = selector.get_support(indices=True)
    selected_features = X.columns[selected_indices]
    selected_dfs[f'selected_features_{k}'] = set_features[selected_features]

# Access the selected dataframes
selected_df_infogain_1000 = selected_dfs['selected_features_1000']
selected_df_infogain_2000 = selected_dfs['selected_features_2000']
selected_df_infogain_3000 = selected_dfs['selected_features_3000']
selected_df_infogain_4000 = selected_dfs['selected_features_4000']
selected_df_infogain_5000 = selected_dfs['selected_features_5000']
selected_df_infogain_6000 = selected_dfs['selected_features_6000']
selected_df_infogain_7000 = selected_dfs['selected_features_7000']
selected_df_infogain_8000 = selected_dfs['selected_features_8000']

In [None]:
# Optionally, you can save these dataframes to separate files
# Save each dataframe to its respective CSV file
selected_df_infogain_1000.to_csv(INFOGAIN_TFIDF_1K, index=False)
print(f"File saved at location: {INFOGAIN_TFIDF_1K}")
selected_df_infogain_2000.to_csv(INFOGAIN_TFIDF_2K, index=False)
print(f"File saved at location: {INFOGAIN_TFIDF_2K}")
selected_df_infogain_3000.to_csv(INFOGAIN_TFIDF_3K, index=False)
print(f"File saved at location: {INFOGAIN_TFIDF_3K}")
selected_df_infogain_4000.to_csv(INFOGAIN_TFIDF_4K, index=False)
print(f"File saved at location: {INFOGAIN_TFIDF_4K}")
selected_df_infogain_5000.to_csv(INFOGAIN_TFIDF_5K, index=False)
print(f"File saved at location: {INFOGAIN_TFIDF_5K}")
selected_df_infogain_6000.to_csv(INFOGAIN_TFIDF_6K, index=False)
print(f"File saved at location: {INFOGAIN_TFIDF_6K}")
selected_df_infogain_7000.to_csv(INFOGAIN_TFIDF_7K, index=False)
print(f"File saved at location: {INFOGAIN_TFIDF_7K}")
selected_df_infogain_8000.to_csv(INFOGAIN_TFIDF_8K, index=False)
print(f"File saved at location: {INFOGAIN_TFIDF_8K}")

# **Feature Loading & Combinations**

**Load Features**

In [29]:
#############################################################################################
# Feature 1 : TF-IDF
#############################################################################################

# Type a : Information Gain
# Read CSV files into separate DataFrames
tfidf_info_gain_1k = pd.read_csv(INFOGAIN_TFIDF_1K)
tfidf_info_gain_2k = pd.read_csv(INFOGAIN_TFIDF_2K)
tfidf_info_gain_3k = pd.read_csv(INFOGAIN_TFIDF_3K)
tfidf_info_gain_4k = pd.read_csv(INFOGAIN_TFIDF_4K)
tfidf_info_gain_5k = pd.read_csv(INFOGAIN_TFIDF_5K)
tfidf_info_gain_6k = pd.read_csv(INFOGAIN_TFIDF_6K)
tfidf_info_gain_7k = pd.read_csv(INFOGAIN_TFIDF_7K)
tfidf_info_gain_8k = pd.read_csv(INFOGAIN_TFIDF_8K)

# Type b : Chi-Square
# Read CSV files into separate DataFrames
tfidf_chi_square_1k = pd.read_csv(CHI_TFIDF_1K)
tfidf_chi_square_2k = pd.read_csv(CHI_TFIDF_2K)
tfidf_chi_square_3k = pd.read_csv(CHI_TFIDF_3K)
tfidf_chi_square_4k = pd.read_csv(CHI_TFIDF_4K)
tfidf_chi_square_5k = pd.read_csv(CHI_TFIDF_5K)
tfidf_chi_square_6k = pd.read_csv(CHI_TFIDF_6K)
tfidf_chi_square_7k = pd.read_csv(CHI_TFIDF_7K)
tfidf_chi_square_8k = pd.read_csv(CHI_TFIDF_8K)

# Type c : Correlation
# Read CSV files into separate DataFrames
tfidf_correlation_1k = pd.read_csv(CORR_TFIDF_1K)
tfidf_correlation_2k = pd.read_csv(CORR_TFIDF_2K)
tfidf_correlation_3k = pd.read_csv(CORR_TFIDF_3K)
tfidf_correlation_4k = pd.read_csv(CORR_TFIDF_4K)
tfidf_correlation_5k = pd.read_csv(CORR_TFIDF_5K)
tfidf_correlation_6k = pd.read_csv(CORR_TFIDF_6K)
tfidf_correlation_7k = pd.read_csv(CORR_TFIDF_7K)
tfidf_correlation_8k = pd.read_csv(CORR_TFIDF_8K)

In [30]:
#############################################################################################
# Feature 2 : Positive and Negative Words Count
#############################################################################################

pc_nc = pd.read_csv(POSNEG_FEATURES)

In [31]:
#############################################################################################
# Feature 3 : Positive and Negative Connotation Count
#############################################################################################

pcc_ncc = pd.read_csv(POSNEG_CONN_FEATURES)

In [32]:
#############################################################################################
# Feature 4 : Vader Score
#############################################################################################

vader_scores = pd.read_csv(VADER_SCORES_FEATURES)

In [33]:
#############################################################################################
# Feature 5 : LDA Topics
#############################################################################################

lda_150_topics = pd.read_csv(LDA_150)
lda_200_topics = pd.read_csv(LDA_200)
lda_250_topics = pd.read_csv(LDA_250)

In [34]:
#############################################################################################
# Defining Target Values
#############################################################################################

y = pd.read_csv(TARGET_VALUES)
y

Unnamed: 0,sentiment
0,1
1,1
2,0
3,1
4,0
...,...
4992,1
4993,0
4994,0
4995,1


# **Release Variables**

In [None]:
# List all variables in the current namespace
%whos

In [None]:
# List all variables that are DataFrames
for variable_name, variable_value in globals().items():
    if isinstance(variable_value, pd.DataFrame):
        print(variable_name)

In [None]:
# Use This to Delete Variable
# del _
# del __
# del ___
del set_features
# del X

# **TRAIN, TEST AND VAILDATE ON MODELS**

In [35]:
# selected_features = tfidf_info_gain_1k
# selected_features = tfidf_info_gain_2k
# selected_features = tfidf_info_gain_3k
# selected_features = tfidf_info_gain_4k
# selected_features = tfidf_info_gain_5k
# selected_features = tfidf_info_gain_6k
# selected_features = tfidf_info_gain_7k
# selected_features = tfidf_info_gain_8k

# selected_features = tfidf_chi_square_1k
# selected_features = tfidf_chi_square_2k
# selected_features = tfidf_chi_square_3k
# selected_features = tfidf_chi_square_4k
# selected_features = tfidf_chi_square_5k
# selected_features = tfidf_chi_square_6k
# selected_features = tfidf_chi_square_7k
# selected_features = tfidf_chi_square_8k

# selected_features = tfidf_correlation_1k
# selected_features = tfidf_correlation_2k
# selected_features = tfidf_correlation_3k
# selected_features = tfidf_correlation_4k
# selected_features = tfidf_correlation_5k
# selected_features = tfidf_correlation_6k
# selected_features = tfidf_correlation_7k
selected_features = tfidf_correlation_8k


selected_features1 = pc_nc

selected_features2 = pcc_ncc

selected_features3 = vader_scores

# selected_features = lda_150_topics
# selected_features = lda_200_topics
selected_features4 = lda_250_topics

In [36]:
# Assuming dfs is a list of dataframes
selected_features = [selected_features,selected_features1,selected_features2,selected_features3,selected_features4]  # List of dataframes to be merged

# Merge dataframes using pd.concat()
selected_features = pd.concat(selected_features,axis=1)

# Display the merged dataframe
selected_features.head()

Unnamed: 0,bad,great,waste,nt,awful,best,love,poor,terrible,act,...,Topic_240_250,Topic_241_250,Topic_242_250,Topic_243_250,Topic_244_250,Topic_245_250,Topic_246_250,Topic_247_250,Topic_248_250,Topic_249_250
0,0.0,0.0,0.0,0.019265,0.0,0.033177,0.0,0.0,0.200227,0.0,...,0.000454,0.000454,0.000454,0.000454,0.000454,0.000454,0.000454,0.000454,0.000454,0.000454
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000341,0.000341,0.000341,0.000341,0.000341,0.000341,0.000341,0.000341,0.000341,0.000341
2,0.057397,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000545,0.000545,0.000545,0.000545,0.000545,0.000545,0.000545,0.000545,0.000545,0.000545
3,0.0,0.0,0.0,0.035752,0.0,0.061571,0.0,0.0,0.0,0.0,...,0.000525,0.000525,0.000525,0.000525,0.000525,0.000525,0.000525,0.000525,0.000525,0.000525
4,0.0,0.028222,0.0,0.018718,0.047243,0.0,0.0,0.0,0.0,0.0,...,0.000399,0.000399,0.000399,0.000399,0.000399,0.000399,0.000399,0.000399,0.000399,0.000399


In [37]:
from sklearn.preprocessing import MinMaxScaler

# Assuming selected_features is your DataFrame and below are the column names which needs to be normalized

# 'Positive_Connotation_Count', 'Negative_Connotation_Count','Positive_Word_Count', 'Negative_Word_Count', 'Positive_VADER_Count', 'Negative_VADER_Count'

columns_to_normalize = ['Positive_Connotation_Count', 'Negative_Connotation_Count','Positive_Word_Count', 'Negative_Word_Count','Positive_VADER_Count', 'Negative_VADER_Count']

scaler = MinMaxScaler()
selected_features[columns_to_normalize] = scaler.fit_transform(selected_features[columns_to_normalize])

In [None]:
# Uncomment the following code if this "Unnamed: 0" column appears 
# selected_features = selected_features.drop(columns=['Unnamed: 0'])
# selected_features.head()

# **5. Classification**

**1. Support Vector Machine**

In [14]:
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict
from sklearn.svm import SVC

# Initialize SVM classifier
svm_classifier = SVC(kernel='linear')

# Perform cross-validation predictions
y_pred = cross_val_predict(svm_classifier, selected_features, y['sentiment'], cv=5)

# Generate classification report
report = classification_report(y, y_pred)

# Print classification report
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.86      0.87      2479
           1       0.86      0.89      0.88      2518

    accuracy                           0.88      4997
   macro avg       0.88      0.87      0.88      4997
weighted avg       0.88      0.88      0.88      4997



**2. Logistic Regression**

In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict

# Initialize Logistic Regression classifier
logistic_regression = LogisticRegression()

# Perform cross-validation predictions
y_pred = cross_val_predict(logistic_regression, selected_features, y['sentiment'], cv=5)

# Generate classification report
report = classification_report(y, y_pred)

# Print classification report
print("Classification Report:")
print(report)

MemoryError: Unable to allocate 252. MiB for an array with shape (3997, 8256) and data type float64

**3. K-Nearest Neighbors**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict

# Initialize KNN classifier with k=5 (you can adjust k as needed)
knn_classifier = KNeighborsClassifier()

# Perform cross-validation predictions
y_pred = cross_val_predict(knn_classifier, selected_features, y['sentiment'], cv=5)

# Generate classification report
report = classification_report(y, y_pred)

# Print classification report
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.44      0.51      2479
           1       0.57      0.72      0.64      2518

    accuracy                           0.58      4997
   macro avg       0.59      0.58      0.57      4997
weighted avg       0.59      0.58      0.57      4997



**4. Multinomial Naive Bayes**

In [None]:
# Multinomial Naive Bayes Classifier
nb_classifier = MultinomialNB()
# Perform cross-validation predictions
y_pred = cross_val_predict(nb_classifier, selected_features, y['sentiment'], cv=5)

# Generate classification report
report = classification_report(y, y_pred)

# Print classification report
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.92      0.89      2479
           1       0.91      0.86      0.89      2518

    accuracy                           0.89      4997
   macro avg       0.89      0.89      0.89      4997
weighted avg       0.89      0.89      0.89      4997



**5. Neural Network**

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(selected_features, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build a simple neural network model
model = Sequential()
# Existing layers
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, verbose=1)

# Evaluate the model on the test set
y_pred = (model.predict(X_test_scaled) > 0.5).astype("int32")
accuracy = accuracy_score(y_test, y_pred)

print("Neural Network Accuracy on Test Set:", accuracy)

Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.6925 - loss: 0.6131
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9936 - loss: 0.0258
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9997 - loss: 0.0041
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 1.0000 - loss: 0.0011
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 1.0000 - loss: 5.0180e-04
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 1.0000 - loss: 2.7637e-04
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 1.0000 - loss: 2.3837e-04
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 1.0000 - loss: 1.6431e-04
Epoch 9/10
[1m125/125

In [None]:
# Generate classification report
report = classification_report(y_test, y_pred)

# Print classification report
print("Classification Report for Neural Network:")
print(report)

Classification Report for Neural Network:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94       498
           1       0.95      0.93      0.94       502

    accuracy                           0.94      1000
   macro avg       0.94      0.94      0.94      1000
weighted avg       0.94      0.94      0.94      1000



In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Assume X_df is your input DataFrame of sequential features (e.g., time series)
# and y_df is your output DataFrame (e.g., labels for binary classification)

# Convert DataFrame to numpy arrays
X = selected_features.values
y = y.values

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Padding sequences if necessary (to ensure fixed length)
X_train_padded = pad_sequences(X_train)  # Use padding if sequences have variable lengths
X_test_padded = pad_sequences(X_test)    # Use the same padding for test data

# Define BiLSTM model
model = Sequential()
model.add(Bidirectional(LSTM(64, return_sequences=True), input_shape=(X_train_padded.shape[1], X_train_padded.shape[2])))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_padded, y_train, epochs=10, batch_size=32, validation_split=0.1)

# Evaluate the model
y_pred = model.predict_classes(X_test_padded)

# Convert predictions from probabilities to binary labels
y_pred_binary = np.squeeze(y_pred)

# Generate classification report
report = classification_report(y_test, y_pred_binary)
print(report)

MemoryError: Unable to allocate 315. MiB for an array with shape (8256, 4997) and data type float64

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Assume X_df is your input DataFrame of sequential features (e.g., time series)
# and y_df is your output DataFrame (e.g., labels for binary classification)

# Convert DataFrame to numpy arrays
X = selected_features.values
y = y.values

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Padding sequences if necessary (to ensure fixed length)
X_train_padded = pad_sequences(X_train)  # Use padding if sequences have variable lengths
X_test_padded = pad_sequences(X_test)    # Use the same padding for test data

# Define GRU model
model = Sequential()
model.add(GRU(64, input_shape=(X_train_padded.shape[1], X_train_padded.shape[2])))
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_padded, y_train, epochs=10, batch_size=32, validation_split=0.1)

# Evaluate the model
y_pred = model.predict_classes(X_test_padded)

# Convert predictions from probabilities to binary labels
y_pred_binary = np.squeeze(y_pred)

# Generate classification report
report = classification_report(y_test, y_pred_binary)
print(report)

AttributeError: 'numpy.ndarray' object has no attribute 'values'