In [1]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB , BernoulliNB
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Ensure you have downloaded the necessary NLTK data files
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\Zeel
[nltk_data]     soni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Zeel
[nltk_data]     soni\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to C:\Users\Zeel
[nltk_data]     soni\AppData\Roaming\nltk_data...


True

In [3]:
df=pd.read_csv('Twitter_Data.csv')
df

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
...,...,...
162975,why these 456 crores paid neerav modi not reco...,-1.0
162976,dear rss terrorist payal gawar what about modi...,-1.0
162977,did you cover her interaction forum where she ...,0.0
162978,there big project came into india modi dream p...,0.0


In [4]:
# Remove missing values
df = df.dropna()

In [5]:
# Convert text to lowercase
df['clean_text'] = df['clean_text'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_text'] = df['clean_text'].str.lower()


In [6]:
# Tokenize the text
df['tokens'] = df['clean_text'].apply(word_tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tokens'] = df['clean_text'].apply(word_tokenize)


In [7]:
# Define stop words
stop_words = set(stopwords.words('english'))

# Remove stop words
df['tokens'] = df['tokens'].apply(lambda x: [word for word in x if word not in stop_words])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tokens'] = df['tokens'].apply(lambda x: [word for word in x if word not in stop_words])


In [8]:
lemmatizer = WordNetLemmatizer()
df['tokens'] = df['tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tokens'] = df['tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])


In [9]:
lemmatizer = WordNetLemmatizer()
df['tokens'] = df['tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tokens'] = df['tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])


In [10]:
df['clean_text'] = df['tokens'].apply(lambda x: ' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_text'] = df['tokens'].apply(lambda x: ' '.join(x))


In [11]:
y = df['category']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], y, test_size=0.2, random_state=42)

In [13]:
count_vectorizer = CountVectorizer()
X_train_counts = count_vectorizer.fit_transform(X_train)
X_test_counts = count_vectorizer.transform(X_test)

In [14]:
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [21]:
# Model Training using CountVectorizer
nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_counts, y_train)
y_pred_tfidf = nb_tfidf.predict(X_test_tfidf)
print("TF-IDF Results")
print(f'Accuracy: {accuracy_score(y_test, y_pred_tfidf)}')
print(classification_report(y_test, y_pred_tfidf))

TF-IDF Results
Accuracy: 0.6368963612934896
              precision    recall  f1-score   support

        -1.0       0.81      0.35      0.48      7152
         0.0       0.83      0.43      0.57     11067
         1.0       0.57      0.94      0.71     14375

    accuracy                           0.64     32594
   macro avg       0.74      0.57      0.59     32594
weighted avg       0.71      0.64      0.61     32594



In [22]:
# Model Training using TF-IDF
tfidf = MultinomialNB()
tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = tfidf.predict(X_test_tfidf)
print("TF-IDF Results")
print(f'Accuracy: {accuracy_score(y_test, y_pred_tfidf)}')
print(classification_report(y_test, y_pred_tfidf))

TF-IDF Results
Accuracy: 0.5717616739277167
              precision    recall  f1-score   support

        -1.0       0.92      0.12      0.22      7152
         0.0       0.88      0.33      0.48     11067
         1.0       0.51      0.98      0.67     14375

    accuracy                           0.57     32594
   macro avg       0.77      0.48      0.46     32594
weighted avg       0.73      0.57      0.51     32594



In [23]:
# Model Training using CountVectorizer
nb_counts = BernoulliNB()
nb_counts.fit(X_train_counts, y_train)
y_pred_counts = nb_counts.predict(X_test_counts)
print("CountVectorizer Results")
print(f'Accuracy: {accuracy_score(y_test, y_pred_counts)}')
print(classification_report(y_test, y_pred_counts))

CountVectorizer Results
Accuracy: 0.7546174142480211
              precision    recall  f1-score   support

        -1.0       0.71      0.46      0.56      7152
         0.0       0.79      0.82      0.80     11067
         1.0       0.74      0.86      0.79     14375

    accuracy                           0.75     32594
   macro avg       0.75      0.71      0.72     32594
weighted avg       0.75      0.75      0.75     32594



In [26]:
import joblib

joblib.dump(nb_counts,'bnb.joblib')

loaded_model=joblib.load('bnb.joblib')

print(loaded_model.predict(X_test_counts))

[ 0.  1. -1. ...  0. -1.  0.]
