In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("Twitter_Data.csv")
print(df.shape)
df.sample(5)


(162980, 2)


Unnamed: 0,clean_text,category
52194,not the ultimate authority crime even otherwis...,0.0
29036,spoiled hurt economistdrifted non economicle i...,0.0
112328,aam aadmi party files complaint with against m...,-1.0
125874,modi should take retirement and give youngster...,0.0
102387,modi shud take steps should talk pakistan,0.0


In [3]:
df['category'].value_counts()


category
 1.0    72250
 0.0    55213
-1.0    35510
Name: count, dtype: int64

In [4]:
df['sentiment_binary'] = df['category'].apply(lambda x: 1 if x == 1.0 else 0)

In [5]:
# Drop neutral sentiment (0.0)
df_binary = df[df['category'] != 0.0]

# Now you only have two classes: 1 (Positive) and -1 (Negative)
df_binary['sentiment_binary'] = df_binary['category'].apply(lambda x: 1 if x == 1.0 else 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_binary['sentiment_binary'] = df_binary['category'].apply(lambda x: 1 if x == 1.0 else 0)


In [6]:
df.sample(5)

Unnamed: 0,clean_text,category,sentiment_binary
31382,sir right bas jhalkh bhar thi napak barbadi pu...,1.0,1
43678,dekh modi saheb bolne aaye oubluc panic meaur ...,0.0,0
87001,over european parliamentarians write modi seek...,1.0,1
81004,crowds the front rows and rows empty chairs ri...,-1.0,0
157237,watch this and share this all reach lord ayyap...,0.0,0


In [7]:
df['sentiment_binary'].value_counts()

sentiment_binary
0    90730
1    72250
Name: count, dtype: int64

In [8]:
df.drop(columns='category', inplace=True)

In [9]:
df.sample(5)

Unnamed: 0,clean_text,sentiment_binary
25173,request you modi sir please help this,0
64561,pay tax file return\nwho modi declare the mone...,0
140239,bjp rss members and their bhaktas felt its dan...,0
47171,congress bakras giving credit nehru bjp bakras...,0
3498,planning finish what modi saved years,0


In [10]:
df.dropna(inplace=True)

In [11]:
df.isnull().sum()

clean_text          0
sentiment_binary    0
dtype: int64

In [12]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [13]:
stemmer = PorterStemmer()
def stemming(content):

    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [stemmer.stem(word) for word in stemmed_content if word not in stopwords.words('english')]
    return ' '.join(stemmed_content)


df['clean_text'] = df['clean_text'].apply(stemming)

In [14]:
X = df[['clean_text']]
y = df['sentiment_binary']

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
# Check column names
print(df.columns)

# If extra spaces are found, strip them from column names
df.columns = df.columns.str.strip()

# Now assign your X and y
vectorizer = TfidfVectorizer()  # You can tweak max_features based on your data
X = vectorizer.fit_transform(df['clean_text'])
y = df['sentiment_binary']  # Make sure the column is correctly named


Index(['clean_text', 'sentiment_binary'], dtype='object')


In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


model_lr = LogisticRegression(class_weight='balanced', max_iter=1000)
model_lr.fit(X_train, y_train)


In [18]:

y_pred = model_lr.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.87      0.91      0.89     18090
           1       0.88      0.83      0.85     14506

    accuracy                           0.87     32596
   macro avg       0.87      0.87      0.87     32596
weighted avg       0.87      0.87      0.87     32596



In [19]:
from xgboost import XGBClassifier

In [20]:


xgb = XGBClassifier()  
xgb.fit(X_train, y_train)
print("CLASSIFICATION REPORT FOR RANDOM XGB CLASSIFIER")
y_xgb_pred = xgb.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)


CLASSIFICATION REPORT FOR RANDOM XGB CLASSIFIER
              precision    recall  f1-score   support

           0       0.87      0.91      0.89     18090
           1       0.88      0.83      0.85     14506

    accuracy                           0.87     32596
   macro avg       0.87      0.87      0.87     32596
weighted avg       0.87      0.87      0.87     32596



In [21]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


stemmer = PorterStemmer()

def predict_sentiment(text):
    text = re.sub("[^a-zA-Z]", " ", text)
    text = text.lower()
    text = text.split()
    text = [stemmer.stem(word) for word in text if word not in stopwords.words("english")]
    text = " ".join(text)
    
    # Step 6: Transform the text using the vectorizer
    text = vectorizer.transform([text])
    
    # Step 7: Predict the sentiment
    sentiment = model_lr.predict(text)
    
  
    if sentiment == 0:  # Assuming 0 for Negative
        return "Negative"
    else:  # Assuming 1 for Positive
        return "Positive"


In [22]:
print(predict_sentiment("i hate you"))
print(predict_sentiment("good"))

Negative
Positive


In [25]:
print(predict_sentiment("bad"))

Negative


In [24]:
import pickle
pickle.dump(model_lr, open("lr_model.pkl", "wb"))
pickle.dump(vectorizer, open("vectorizer.pkl", "wb"))