In [1]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.pipeline import make_pipeline

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

from bs4 import BeautifulSoup
from langdetect import detect
from urllib.parse import urlsplit

from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sdidd\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sdidd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sdidd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sdidd\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Pre Processing

In [2]:
df = pd.read_csv("../Dataset/IMDB Dataset.csv")
df.head()
df = df.rename(columns={'review': 'OriginalReviews'})
df = df.rename(columns={'sentiment': 'OutputSentiment'})
df_subset = df.sample(n=5000, random_state=42).reset_index(drop=True)
df_subset.head()
df_subset['OutputSentiment'].value_counts()

OutputSentiment
positive    2519
negative    2481
Name: count, dtype: int64

In [3]:
df_subset

Unnamed: 0,OriginalReviews,OutputSentiment
0,I really liked this Summerslam due to the look...,positive
1,Not many television shows appeal to quite as m...,positive
2,The film quickly gets to a major chase scene w...,negative
3,Jane Austen would definitely approve of this o...,positive
4,Expectations were somewhat high for me when I ...,negative
...,...,...
4995,One of eastwood's best movies after he had sep...,positive
4996,My blurred childhood memories have kept the ec...,negative
4997,I love Zombie-Movies and I love amateur-produc...,negative
4998,Chan is in New York and he gets involved with ...,positive


In [4]:
#lowercase
df_subset["OriginalReviews"]=df_subset["OriginalReviews"].apply(lambda x:x.lower())

# Function to remove punctuation from text
def remove_punctuation_from_text(text):
    punctuation_to_remove = string.punctuation
    translator = str.maketrans("", "", punctuation_to_remove)
    return text.translate(translator)

# Remove punctuation and count punctuation in each input text
def count_punctuation(text):
    text_without_punctuation = remove_punctuation_from_text(text)
    punctuation_count = len(text) - len(text_without_punctuation)
    return punctuation_count

# Apply the function to count punctuation and add as a new column
df_subset['PunctuationCount'] = df_subset['OriginalReviews'].apply(count_punctuation)

# Remove numbers from the 'OriginalReviewss' column
df_subset['OriginalReviews'] = df_subset['OriginalReviews'].str.replace('\d+', '')

print(df_subset)


                                        OriginalReviews OutputSentiment  \
0     i really liked this summerslam due to the look...        positive   
1     not many television shows appeal to quite as m...        positive   
2     the film quickly gets to a major chase scene w...        negative   
3     jane austen would definitely approve of this o...        positive   
4     expectations were somewhat high for me when i ...        negative   
...                                                 ...             ...   
4995  one of eastwood's best movies after he had sep...        positive   
4996  my blurred childhood memories have kept the ec...        negative   
4997  i love zombie-movies and i love amateur-produc...        negative   
4998  chan is in new york and he gets involved with ...        positive   
4999  my wife and i both thought this film a watered...        negative   

      PunctuationCount  
0                   23  
1                   72  
2                   22  

In [5]:
df_subset

Unnamed: 0,OriginalReviews,OutputSentiment,PunctuationCount
0,i really liked this summerslam due to the look...,positive,23
1,not many television shows appeal to quite as m...,positive,72
2,the film quickly gets to a major chase scene w...,negative,22
3,jane austen would definitely approve of this o...,positive,50
4,expectations were somewhat high for me when i ...,negative,86
...,...,...,...
4995,one of eastwood's best movies after he had sep...,positive,4
4996,my blurred childhood memories have kept the ec...,negative,52
4997,i love zombie-movies and i love amateur-produc...,negative,51
4998,chan is in new york and he gets involved with ...,positive,27


In [6]:

# NLTK stopwords
stop_words = set(stopwords.words('english'))

# Function to remove stopwords from text and count them
def remove_stopwords_and_count(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    stopwords_count = len(tokens) - len(filtered_tokens)
    return stopwords_count

# Apply the function to remove stopwords and count them, then add as a new column
df_subset['StopwordsCount'] = df_subset['OriginalReviews'].apply(remove_stopwords_and_count)

In [7]:
df_subset

Unnamed: 0,OriginalReviews,OutputSentiment,PunctuationCount,StopwordsCount
0,i really liked this summerslam due to the look...,positive,23,84
1,not many television shows appeal to quite as m...,positive,72,162
2,the film quickly gets to a major chase scene w...,negative,22,54
3,jane austen would definitely approve of this o...,positive,50,39
4,expectations were somewhat high for me when i ...,negative,86,166
...,...,...,...,...
4995,one of eastwood's best movies after he had sep...,positive,4,24
4996,my blurred childhood memories have kept the ec...,negative,52,59
4997,i love zombie-movies and i love amateur-produc...,negative,51,56
4998,chan is in new york and he gets involved with ...,positive,27,88


In [8]:
def remove_urls(text):
    # Define a regular expression pattern to match URLs
    url_pattern = re.compile(r'https?://\S+|www\.\S+')

    # Find all matches in the text
    urls = re.findall(url_pattern, text)

    # Remove URLs from the text
    text_without_urls = re.sub(url_pattern, '', text)

    return text_without_urls

# Example usage
df_subset['OriginalReviews'] = df_subset['OriginalReviews'].apply(remove_urls)

In [9]:
def remove_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

df_subset["OriginalReviews"] = df_subset["OriginalReviews"].apply(remove_html_tags)

  soup = BeautifulSoup(text, 'html.parser')


In [10]:
def clean_text(text):
    # Remove non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

df_subset['OriginalReviews'] = df_subset['OriginalReviews'].apply(clean_text)

In [11]:
def remove_extra_whitespaces(text):
    # Use regular expression to replace multiple whitespaces with a single space
    return re.sub(r'\s+', ' ', text).strip()

df_subset['OriginalReviews'] = df_subset['OriginalReviews'].apply(remove_extra_whitespaces)

In [12]:
def filter_non_english(text):
    try:
        return detect(text) == 'en'
    except:
        return False

# Create a boolean mask for non-English OriginalReviewss
mask = df_subset['OriginalReviews'].apply(filter_non_english)

# Create a new DataFrame containing only English OriginalReviewss
df_subset = df_subset[mask]

In [13]:
# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to get the part of speech for WordNet lemmatizer
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun if the part of speech is not found

# Function to lemmatize a text
def lemmatize_text(text):
    tokens = word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    lemmatized_tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in pos_tags]
    return ' '.join(lemmatized_tokens)

# Apply lemmatization to the 'text' column
df_subset['OriginalReviews'] = df_subset['OriginalReviews'].apply(lemmatize_text)

In [27]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.corpus import wordnet as wn

def count_word_types(text):
    tokens = word_tokenize(text)
    tagged_tokens = pos_tag(tokens)
    
    adj_count = 0
    adv_count = 0
    temporal_count = 0
    
    for word, tag in tagged_tokens:
        if tag.startswith('JJ'):  # Adjective
            adj_count += 1
        elif tag.startswith('RB'):  # Adverb
            adv_count += 1
            
    return {'AdjectiveCount': adj_count, 'AdverbCount': adv_count}

# Apply the function to count word types and add as new columns
df_counts = df_subset['OriginalReviews'].apply(count_word_types).apply(pd.Series)

# Concatenate the counts DataFrame with the original DataFrame
df_subset = pd.concat([df_subset, df_counts], axis=1)


In [28]:
df_subset

Unnamed: 0,OriginalReviews,OutputSentiment,PunctuationCount,StopwordsCount,AdjectiveCount,AdverbCount,TemporalWordCount,AdjectiveCount.1,AdverbCount.1
0,i really like this summerslam due to the look ...,positive,23,84,22,14,0,22,14
1,not many television show appeal to quite a man...,positive,72,162,39,20,0,39,20
2,the film quickly get to a major chase scene wi...,negative,22,54,8,8,0,8,8
3,jane austen would definitely approve of this o...,positive,50,39,13,13,0,13,13
4,expectation be somewhat high for me when i go ...,negative,86,166,33,14,0,33,14
...,...,...,...,...,...,...,...,...,...
4995,one of eastwoods best movie after he have sepa...,positive,4,24,4,2,0,4,2
4996,my blur childhood memory have keep the echo of...,negative,52,59,19,7,0,19,7
4997,i love zombiemovies and i love amateurproducti...,negative,51,56,12,12,0,12,12
4998,chan be in new york and he get involve with an...,positive,27,88,15,17,0,15,17


In [16]:
df_subset.to_csv("../csv/Preprocessed_data.csv",index=False)

## Feature Extraction Using TF-IDF

In [95]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Define the number of topics for LDA
num_topics = 150  # You can adjust this number based on your requirements

# Create a pipeline for TF-IDF features
tfidf_pipeline = make_pipeline(
    CountVectorizer(),  # CountVectorizer converts text to a matrix of token counts
    TfidfTransformer()  # TF-IDF transformation
)

# Create a pipeline for LDA
lda_pipeline = make_pipeline(
    CountVectorizer(),  # CountVectorizer converts text to a matrix of token counts
    TfidfTransformer(),  # TF-IDF transformation
    LatentDirichletAllocation(n_components=num_topics, random_state=42)  # LDA for topic modeling
)

# Fit and transform data separately for TF-IDF and LDA
X_tfidf = tfidf_pipeline.fit_transform(df_subset['OriginalReviews'])
X_lda = lda_pipeline.fit_transform(df_subset['OriginalReviews'])

# Get feature names from the CountVectorizer in the TF-IDF pipeline
tfidf_feature_names = tfidf_pipeline.named_steps['countvectorizer'].get_feature_names()

# Concatenate the existing DataFrame with the new features DataFrames
df_lda = pd.concat([
    df_subset,
    pd.DataFrame(X_tfidf.toarray(), columns=tfidf_feature_names),
    pd.DataFrame(X_lda, columns=[f"Topic_{i}" for i in range(num_topics)])
], axis=1)

df_lda.to_csv("../csv/separate_features_df.csv")


AttributeError: 'CountVectorizer' object has no attribute 'get_feature_names'

In [None]:
df_lda

Unnamed: 0,OriginalReviews,OutputSentiment,PunctuationCount,StopwordsCount,AdjectiveCount,AdverbCount,TemporalWordCount,AdjectiveCount.1,AdverbCount.1,0
0,i really like this summerslam due to the look ...,positive,23,84,22,14,0,22,14,"(0, 100)\t0.09506747668032936\n (0, 811)\t0..."
1,not many television show appeal to quite a man...,positive,72,162,39,20,0,39,20,"(0, 96)\t0.058933434637839684\n (0, 553)\t0..."
2,the film quickly get to a major chase scene wi...,negative,22,54,8,8,0,8,8,"(0, 1242)\t0.08513785978361742\n (0, 1968)\..."
3,jane austen would definitely approve of this o...,positive,50,39,13,13,0,13,13,"(0, 1088)\t0.10148153949160933\n (0, 2148)\..."
4,expectation be somewhat high for me when i go ...,negative,86,166,33,14,0,33,14,"(0, 612)\t0.05950598446226236\n (0, 1346)\t..."
...,...,...,...,...,...,...,...,...,...,...
4995,one of eastwoods best movie after he have sepa...,positive,4,24,4,2,0,4,2,"(0, 1607)\t0.11598554105252347\n (0, 2407)\..."
4996,my blur childhood memory have keep the echo of...,negative,52,59,19,7,0,19,7,"(0, 712)\t0.10573088796429253\n (0, 1166)\t..."
4997,i love zombiemovies and i love amateurproducti...,negative,51,56,12,12,0,12,12,"(0, 1282)\t0.05867925161486874\n (0, 1607)\..."
4998,chan be in new york and he get involve with an...,positive,27,88,15,17,0,15,17,"(0, 991)\t0.03464006112331686\n (0, 1784)\t..."


## CONNOTATIONS

In [81]:
# Download the VADER lexicon (run this once)
nltk.download('vader_lexicon')
delimiter = '\t'

# Read the text file into a DataFrame
positive = pd.read_csv(r'..\Connotations\positive-words.txt', sep=delimiter, names=['words'])
negative = pd.read_csv(r'..\Connotations\negative-words.txt', sep=delimiter, names=['words'])
connotations = pd.read_csv(r"..\Connotations\connotations.csv")

word_emotion_map = dict(zip(connotations['word'], connotations['emotion']))

# Assuming word_emotion_map and update_counts functions are defined

def update_counts(review):
    positive_count = sum(1 for word in review.split() if word in word_emotion_map and word_emotion_map[word] == 'positive')
    negative_count = sum(1 for word in review.split() if word in word_emotion_map and word_emotion_map[word] == 'negative')
    return positive_count, negative_count

# Apply the update_counts function row-wise and split the returned tuple into separate columns
df_lda[['Positive_Connotation_Count', 'Negative_Connotation_Count']] = df_lda.apply(lambda row: pd.Series(update_counts(row['OriginalReviews'])), axis=1)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\sdidd\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [82]:
# Load positive and negative words from files
positive_words_df = pd.read_csv(r'..\Connotations\positive-words.txt', header=None, names=['words'])
negative_words_df = pd.read_csv(r'..\Connotations\negative-words.txt', header=None, names=['words'])

# Convert DataFrame columns to sets
positive_words = set(positive_words_df['words'].tolist())
negative_words = set(negative_words_df['words'].tolist())

# Assuming 'tfidf_df_13k' is your DataFrame
# Define a function to update counts based on positive and negative words
def update_word_counts(review):
    positive_count = sum(1 for word in review.split() if word in positive_words)
    negative_count = sum(1 for word in review.split() if word in negative_words)
    return positive_count, negative_count

# Apply the function to the 'OriginalReviews' column and unpack the result into two new columns
df_lda[['Positive_Word_Count', 'Negative_Word_Count']] = df_lda.apply(lambda row: pd.Series(update_word_counts(row['OriginalReviews'])), axis=1)


In [83]:
from nltk.sentiment import SentimentIntensityAnalyzer
# Use VADER for sentiment analysis
sid = SentimentIntensityAnalyzer()

def vader_sentiment(review):
    scores = sid.polarity_scores(review)
    return scores['pos'] *100, scores['neg'] * 100

# Apply the function to the 'OriginalReviews' column and unpack the result into two new columns
df_lda[['Positive_VADER_Count', 'Negative_VADER_Count']] = df_lda.apply(lambda row: pd.Series(vader_sentiment(row['OriginalReviews'])), axis=1)

df_lda.to_csv("../csv/df_lda_connotations_vader.csv")

In [84]:
df_lda

Unnamed: 0,OriginalReviews,OutputSentiment,PunctuationCount,StopwordsCount,AdjectiveCount,AdverbCount,TemporalWordCount,AdjectiveCount.1,AdverbCount.1,0,Positive_Connotation_Count,Negative_Connotation_Count,Positive_Word_Count,Negative_Word_Count,Positive_VADER_Count,Negative_VADER_Count
0,i really like this summerslam due to the look ...,positive,23,84,22,14,0,22,14,"(0, 100)\t0.09506747668032936\n (0, 811)\t0...",73,44,7,9,12.6,10.3
1,not many television show appeal to quite a man...,positive,72,162,39,20,0,39,20,"(0, 96)\t0.058933434637839684\n (0, 553)\t0...",105,96,14,6,13.4,1.3
2,the film quickly get to a major chase scene wi...,negative,22,54,8,8,0,8,8,"(0, 1242)\t0.08513785978361742\n (0, 1968)\...",28,31,4,4,15.3,3.7
3,jane austen would definitely approve of this o...,positive,50,39,13,13,0,13,13,"(0, 1088)\t0.10148153949160933\n (0, 2148)\...",30,24,8,5,21.0,12.1
4,expectation be somewhat high for me when i go ...,negative,86,166,33,14,0,33,14,"(0, 612)\t0.05950598446226236\n (0, 1346)\t...",99,70,8,10,9.3,9.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,one of eastwoods best movie after he have sepa...,positive,4,24,4,2,0,4,2,"(0, 1607)\t0.11598554105252347\n (0, 2407)\...",9,11,3,0,19.7,0.0
4996,my blur childhood memory have keep the echo of...,negative,52,59,19,7,0,19,7,"(0, 712)\t0.10573088796429253\n (0, 1166)\t...",40,24,3,4,11.0,10.7
4997,i love zombiemovies and i love amateurproducti...,negative,51,56,12,12,0,12,12,"(0, 1282)\t0.05867925161486874\n (0, 1607)\...",41,26,11,3,19.4,5.6
4998,chan be in new york and he get involve with an...,positive,27,88,15,17,0,15,17,"(0, 991)\t0.03464006112331686\n (0, 1784)\t...",40,59,8,5,20.4,16.9


In [85]:
# text = tfidf_df_13k.iloc[4993]['OriginalReviews']

In [86]:
#tfidf_df_13k = pd.read_csv("../csv/tfidf_df_13k.csv")

In [87]:
#tfidf_df_13k_connotations = pd.read_csv('../csv/tfidf_df_13k_connotations_vader.csv')

In [88]:
# tfidf_df_13k_connotations = df_lda.drop('Unnamed: 0',axis=1)

In [89]:
df_lda_connotations = df_lda

In [90]:
df_lda_connotations

Unnamed: 0,OriginalReviews,OutputSentiment,PunctuationCount,StopwordsCount,AdjectiveCount,AdverbCount,TemporalWordCount,AdjectiveCount.1,AdverbCount.1,0,Positive_Connotation_Count,Negative_Connotation_Count,Positive_Word_Count,Negative_Word_Count,Positive_VADER_Count,Negative_VADER_Count
0,i really like this summerslam due to the look ...,positive,23,84,22,14,0,22,14,"(0, 100)\t0.09506747668032936\n (0, 811)\t0...",73,44,7,9,12.6,10.3
1,not many television show appeal to quite a man...,positive,72,162,39,20,0,39,20,"(0, 96)\t0.058933434637839684\n (0, 553)\t0...",105,96,14,6,13.4,1.3
2,the film quickly get to a major chase scene wi...,negative,22,54,8,8,0,8,8,"(0, 1242)\t0.08513785978361742\n (0, 1968)\...",28,31,4,4,15.3,3.7
3,jane austen would definitely approve of this o...,positive,50,39,13,13,0,13,13,"(0, 1088)\t0.10148153949160933\n (0, 2148)\...",30,24,8,5,21.0,12.1
4,expectation be somewhat high for me when i go ...,negative,86,166,33,14,0,33,14,"(0, 612)\t0.05950598446226236\n (0, 1346)\t...",99,70,8,10,9.3,9.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,one of eastwoods best movie after he have sepa...,positive,4,24,4,2,0,4,2,"(0, 1607)\t0.11598554105252347\n (0, 2407)\...",9,11,3,0,19.7,0.0
4996,my blur childhood memory have keep the echo of...,negative,52,59,19,7,0,19,7,"(0, 712)\t0.10573088796429253\n (0, 1166)\t...",40,24,3,4,11.0,10.7
4997,i love zombiemovies and i love amateurproducti...,negative,51,56,12,12,0,12,12,"(0, 1282)\t0.05867925161486874\n (0, 1607)\...",41,26,11,3,19.4,5.6
4998,chan be in new york and he get involve with an...,positive,27,88,15,17,0,15,17,"(0, 991)\t0.03464006112331686\n (0, 1784)\t...",40,59,8,5,20.4,16.9


In [91]:
df_statistical = df_lda_connotations.drop(columns=['OriginalReviews','AdjectiveCount','AdverbCount','StopwordsCount','PunctuationCount','Positive_Connotation_Count','Negative_Connotation_Count','Positive_Word_Count','Negative_Word_Count','Positive_VADER_Count','Negative_VADER_Count'], axis=1)
df_statistical.head()

Unnamed: 0,OutputSentiment,TemporalWordCount,0
0,positive,0,"(0, 100)\t0.09506747668032936\n (0, 811)\t0..."
1,positive,0,"(0, 96)\t0.058933434637839684\n (0, 553)\t0..."
2,negative,0,"(0, 1242)\t0.08513785978361742\n (0, 1968)\..."
3,positive,0,"(0, 1088)\t0.10148153949160933\n (0, 2148)\..."
4,negative,0,"(0, 612)\t0.05950598446226236\n (0, 1346)\t..."


In [92]:
label = LabelEncoder()
df_statistical['OutputSentiment'] = label.fit_transform(df_statistical['OutputSentiment'])

## CHI SQAURE

In [93]:
# Assuming required columns are 'StopwordsCount', 'PunctuationCount', etc.
required_columns = ['AdverbCount','AdjectiveCount','StopwordsCount', 'PunctuationCount', 'Positive_Connotation_Count',
                    'Negative_Connotation_Count', 'Positive_Word_Count', 'Negative_Word_Count',
                    'Positive_VADER_Count', 'Negative_VADER_Count']

In [94]:
# This will get the top 5000 relavant features out of the sample
chi2_selector = SelectKBest(chi2, k=5000)

# This will transform the dataset i.e, it will reduce the dimensions by just considering the relavant features only
X = df_statistical.drop(columns=['OutputSentiment'])
y = df_statistical['OutputSentiment']
X_5000 = chi2_selector.fit_transform(X, y)

# Get the indices of the selected features
selected_feature_indices = chi2_selector.get_support(indices=True)

# Get the names of the selected features
selected_feature_names = X.columns[selected_feature_indices]

chisq_5k = X[selected_feature_names]
chisq_5k.head()

chisq_5k = pd.concat([chisq_5k,df_lda_connotations[required_columns]],axis=1)
chisq_5k.head()

TypeError: Feature names are only supported if all input features have string names, but your input has ['int', 'str'] as feature name / column name types. If you want feature names to be stored and validated, you must convert them all to strings, by using X.columns = X.columns.astype(str) for example. Otherwise you can remove feature / column names from your input data, or convert them all to a non-string data type.

In [None]:
# This will get the top 8000 relavant features out of the sample
chi2_selector = SelectKBest(chi2, k=8000)

# This will transform the dataset i.e, it will reduce the dimensions by just considering the relavant features only
X = df_statistical.drop(columns=['OutputSentiment'])
y = df_statistical['OutputSentiment']
X_8000 = chi2_selector.fit_transform(X, y)

# Get the indices of the selected features
selected_feature_indices = chi2_selector.get_support(indices=True)

# Get the names of the selected features
selected_feature_names = X.columns[selected_feature_indices]

chisq_8k = X[selected_feature_names]
chisq_8k.head()

chisq_8k = pd.concat([chisq_8k,df_lda_connotations[required_columns]],axis=1)
chisq_8k.head()



Unnamed: 0,TemporalWordCount,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,...,AdverbCount,AdjectiveCount,StopwordsCount,PunctuationCount,Positive_Connotation_Count,Negative_Connotation_Count,Positive_Word_Count,Negative_Word_Count,Positive_VADER_Count,Negative_VADER_Count
0,0,0.000686,0.000686,0.014282,0.000686,0.000686,0.000686,0.000686,0.000686,0.000686,...,14,22,84,23,73,44,7,9,12.6,10.3
1,0,0.000493,0.000493,0.000493,0.000493,0.000493,0.000493,0.000493,0.018071,0.000493,...,20,39,162,72,105,96,14,6,13.4,1.3
2,0,0.000748,0.000748,0.000748,0.000748,0.000748,0.000748,0.000748,0.000748,0.000748,...,8,8,54,22,28,31,4,4,15.3,3.7
3,0,0.000795,0.000795,0.000795,0.000795,0.000795,0.000795,0.000795,0.000795,0.000795,...,13,13,39,50,30,24,8,5,21.0,12.1
4,0,0.000585,0.000585,0.000585,0.000585,0.000585,0.000585,0.000585,0.000585,0.000585,...,14,33,166,86,99,70,8,10,9.3,9.5


In [None]:
# from sklearn.feature_selection import SelectKBest, f_regression

# # For 5000 relevant features
# cor_selector_5k = SelectKBest(f_regression, k=5000)

# # Transform the dataset to reduce dimensions by considering only the relevant features
# X = df_statistical.drop(columns=['OutputSentiment'])
# y = df_statistical['OutputSentiment']
# X_5000 = cor_selector_5k.fit_transform(X, y)

# # Get the indices of the selected features
# selected_feature_indices_5k = cor_selector_5k.get_support(indices=True)

# # Get the names of the selected features
# selected_feature_names_5k = X.columns[selected_feature_indices_5k]

# cor_5k = X[selected_feature_names_5k]
# cor_5k.head()

# cor_5k = pd.concat([cor_5k, tfidf_df_13k_connotations.iloc[:, -6:]], axis=1)
# cor_5k.head()

# # For 8000 relevant features
# cor_selector_8k = SelectKBest(f_regression, k=8000)

# # Transform the dataset to reduce dimensions by considering only the relevant features
# X = df_statistical.drop(columns=['OutputSentiment'])
# y = df_statistical['OutputSentiment']
# X_8000 = cor_selector_8k.fit_transform(X, y)

# # Get the indices of the selected features
# selected_feature_indices_8k = cor_selector_8k.get_support(indices=True)

# # Get the names of the selected features
# selected_feature_names_8k = X.columns[selected_feature_indices_8k]

# cor_8k = X[selected_feature_names_8k]
# cor_8k.head()

# cor_8k = pd.concat([cor_8k, tfidf_df_13k_connotations.iloc[:, -6:]], axis=1)
# cor_8k.head()

In [None]:
# from sklearn.feature_selection import SelectKBest, mutual_info_regression

# # For 5000 relevant features using mutual information
# info_gain_selector_5k = SelectKBest(mutual_info_regression, k=5000)

# # Transform the dataset to reduce dimensions by considering only the relevant features
# X = df_statistical.drop(columns=['OutputSentiment'])
# y = df_statistical['OutputSentiment']
# X_5000 = info_gain_selector_5k.fit_transform(X, y)

# # Get the indices of the selected features
# selected_feature_indices_5k = info_gain_selector_5k.get_support(indices=True)

# # Get the names of the selected features
# selected_feature_names_5k = X.columns[selected_feature_indices_5k]

# info_gain_5k = X[selected_feature_names_5k]
# info_gain_5k.head()

# info_gain_5k = pd.concat([info_gain_5k, tfidf_df_13k_connotations.iloc[:, -6:]], axis=1)
# info_gain_5k.head()

# # For 8000 relevant features using mutual information
# info_gain_selector_8k = SelectKBest(mutual_info_regression, k=8000)

# # Transform the dataset to reduce dimensions by considering only the relevant features
# X = df_statistical.drop(columns=['OutputSentiment'])
# y = df_statistical['OutputSentiment']
# X_8000 = info_gain_selector_8k.fit_transform(X, y)

# # Get the indices of the selected features
# selected_feature_indices_8k = info_gain_selector_8k.get_support(indices=True)

# # Get the names of the selected features
# selected_feature_names_8k = X.columns[selected_feature_indices_8k]

# info_gain_8k = X[selected_feature_names_8k]
# info_gain_8k.head()

# info_gain_8k = pd.concat([info_gain_8k, tfidf_df_13k_connotations.iloc[:, -6:]], axis=1)
# info_gain_8k.head()

Unnamed: 0,007,007s,0080,010,1010,1010br,10dirmick,10yearold,1110,112,...,zu,zucco,zunz,zwick,Positive_Connotation_Count,Negative_Connotation_Count,Positive_Word_Count,Negative_Word_Count,Positive_VADER_Count,Negative_VADER_Count
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,55,24,7,9,20.3,12.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,77,50,14,6,21.9,2.9
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,20,18,4,4,23.1,9.1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,22,18,7,5,32.4,10.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,69,42,8,10,15.9,14.8


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Assuming chisq_8k is your DataFrame
columns_to_normalize = ['Positive_Connotation_Count', 'Negative_Connotation_Count', 
                         'Positive_Word_Count', 'Negative_Word_Count', 
                         'Positive_VADER_Count', 'Negative_VADER_Count']

scaler = MinMaxScaler()
chisq_8k[columns_to_normalize] = scaler.fit_transform(chisq_8k[columns_to_normalize])

In [None]:
chisq_8k

Unnamed: 0,TemporalWordCount,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,...,AdverbCount,AdjectiveCount,StopwordsCount,PunctuationCount,Positive_Connotation_Count,Negative_Connotation_Count,Positive_Word_Count,Negative_Word_Count,Positive_VADER_Count,Negative_VADER_Count
0,0,0.000686,0.000686,0.014282,0.000686,0.000686,0.000686,0.000686,0.000686,0.000686,...,14,22,84,23,0.217666,0.140000,0.104478,0.134328,0.234201,0.210634
1,0,0.000493,0.000493,0.000493,0.000493,0.000493,0.000493,0.000493,0.018071,0.000493,...,20,39,162,72,0.318612,0.313333,0.208955,0.089552,0.249071,0.026585
2,0,0.000748,0.000748,0.000748,0.000748,0.000748,0.000748,0.000748,0.000748,0.000748,...,8,8,54,22,0.075710,0.096667,0.059701,0.059701,0.284387,0.075665
3,0,0.000795,0.000795,0.000795,0.000795,0.000795,0.000795,0.000795,0.000795,0.000795,...,13,13,39,50,0.082019,0.073333,0.119403,0.074627,0.390335,0.247444
4,0,0.000585,0.000585,0.000585,0.000585,0.000585,0.000585,0.000585,0.000585,0.000585,...,14,33,166,86,0.299685,0.226667,0.119403,0.149254,0.172862,0.194274
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,0.001097,0.001097,0.001097,0.001097,0.001097,0.001097,0.001097,0.001097,0.001097,...,2,4,24,4,0.015773,0.030000,0.044776,0.000000,0.366171,0.000000
4996,0,0.000721,0.000721,0.000721,0.000721,0.000721,0.000721,0.000721,0.000721,0.000721,...,7,19,59,52,0.113565,0.073333,0.044776,0.059701,0.204461,0.218814
4997,0,0.000708,0.000708,0.000708,0.000708,0.000708,0.000708,0.000708,0.000708,0.000708,...,12,12,56,51,0.116719,0.080000,0.164179,0.044776,0.360595,0.114519
4998,0,0.000680,0.000680,0.000680,0.000680,0.000680,0.010992,0.000680,0.000680,0.000680,...,17,15,88,27,0.113565,0.190000,0.119403,0.074627,0.379182,0.345603


## CLASSIFICATION

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

# Multinomial Naive Bayes Classifier
nb_classifier = MultinomialNB()
nb_scores = cross_val_score(nb_classifier, chisq_8k, y, cv=5)

print("Multinomial Naive Bayes Cross-Validation Scores:")
print(nb_scores)
print("Mean Accuracy:", np.mean(nb_scores))

# k-Nearest Neighbors Classifier
knn_classifier = KNeighborsClassifier()
knn_scores = cross_val_score(knn_classifier, chisq_5k, y, cv=5)

print("\nk-Nearest Neighbors Cross-Validation Scores:")
print(knn_scores)
print("Mean Accuracy:", np.mean(knn_scores))

Multinomial Naive Bayes Cross-Validation Scores:
[0.583 0.608 0.624 0.597 0.616]
Mean Accuracy: 0.6056

k-Nearest Neighbors Cross-Validation Scores:
[0.725 0.716 0.714 0.683 0.715]
Mean Accuracy: 0.7106


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Load your data
# Assuming X and y are your features and target variables

# Initialize models
svm_model = SVC(kernel='linear')  # Linear SVM
logistic_model = LogisticRegression()

# Initialize KFold cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform 5-fold cross-validation for SVM
svm_scores = cross_val_score(svm_model, chisq_5k, y, cv=kfold)

# Perform 5-fold cross-validation for Logistic Regression
logistic_scores = cross_val_score(logistic_model, chisq_8k, y, cv=kfold)

# Display the cross-validation scores
print("SVM Cross-validation scores:", svm_scores)
print("Logistic Regression Cross-validation scores:", logistic_scores)

# Optionally, you can calculate mean and standard deviation of the scores
print("SVM Mean Accuracy:", np.mean(svm_scores))
print("SVM Standard Deviation of Accuracy:", np.std(svm_scores))
print("Logistic Regression Mean Accuracy:", np.mean(logistic_scores))
print("Logistic Regression Standard Deviation of Accuracy:", np.std(logistic_scores))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

SVM Cross-validation scores: [0.766 0.768 0.756 0.745 0.774]
Logistic Regression Cross-validation scores: [0.762 0.765 0.755 0.743 0.771]
SVM Mean Accuracy: 0.7618
SVM Standard Deviation of Accuracy: 0.010205880657738468
Logistic Regression Mean Accuracy: 0.7592
Logistic Regression Standard Deviation of Accuracy: 0.009600000000000008


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import StratifiedKFold

# Assuming chisq_8k has features and y is the output

# Encode categorical labels if needed
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(chisq_8k, y_encoded, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build a simple neural network model
model = Sequential()
model.add(Dense(128, input_dim=X_train_scaled.shape[1], activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, verbose=1)

# Evaluate the model on the test set
y_pred = (model.predict(X_test_scaled) > 0.5).astype("int32")
accuracy = accuracy_score(y_test, y_pred)

print("Neural Network Accuracy on Test Set:", accuracy)

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 941us/step - accuracy: 0.6096 - loss: 0.6610
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7570 - loss: 0.4981
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 957us/step - accuracy: 0.7686 - loss: 0.4749
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 931us/step - accuracy: 0.7883 - loss: 0.4516
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 919us/step - accuracy: 0.8027 - loss: 0.4221
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 914us/step - accuracy: 0.7999 - loss: 0.4303
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 944us/step - accuracy: 0.8156 - loss: 0.3934
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 915us/step - accuracy: 0.8206 - loss: 0.3898
Epoch 9/10
[1m125/125[0m [32m━━━━━

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf

# Assuming chisq_8k has features and y is the output

# Encode categorical labels if needed
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(chisq_8k, y_encoded, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 
# 
# 
# This is Custom Optimizer
# 
# 
# 

optimizer = tf.keras.optimizers.experimental.Adagrad(
    learning_rate=0.1,
    initial_accumulator_value=0.1,
    epsilon=1e-07,
    weight_decay=0.001,
    clipnorm=None,
    clipvalue=None,
    global_clipnorm=None,
    use_ema=False,
    ema_momentum=0.99,
    ema_overwrite_frequency=None,
    jit_compile=True,
    name='Adagrad',
)


# # Build a simple neural network model
# model = ""
# model = Sequential()
# model.add(Dense(128, input_dim=X_train_scaled.shape[1], activation='relu'))
# model.add(Dense(64, activation='relu'))
# model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Define early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model with early stopping
history = model.fit(
    X_train_scaled, y_train, 
    epochs=50, batch_size=32, 
    validation_split=0.15,  # Using a portion of training set for validation
    callbacks=[early_stopping],
    verbose=1
)

# Evaluate the model on the test set
y_pred = (model.predict(X_test_scaled) > 0.5).astype("int32")
accuracy = accuracy_score(y_test, y_pred)

print("Neural Network Accuracy on Test Set:", accuracy)


AttributeError: module 'keras.optimizers' has no attribute 'experimental'