In [1]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix


In [5]:
df = pd.read_csv(
    r"D:\Sudarshan Kasar\Capstone\Twitter Sentiment Analysis\Sentiment_cleaned.csv",
    encoding="latin1",
    header=None
)

df.columns = ['target', 'id', 'date', 'flag', 'user', 'text']


In [None]:
df=df.drop(['id','date','flag','user'],axis=1)

In [10]:
df.head()

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [11]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to C:\Users\GAURAV
[nltk_data]     DOND\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
def clean_text(text):
    text=text.lower()
    text=re.sub(r'http\S+','',text)
    text=re.sub(r'<.*?>','',text)
    text=re.sub(r'[^a-z\s]','',text)
    words=text.split()
    words=[w for w in words if w not in stop_words]
    return ' '.join(words)

df['Clean_text']=df['text'].apply(clean_text)

In [13]:
df.head()

Unnamed: 0,target,text,Clean_text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot awww thats bummer shoulda got david...
1,0,is upset that he can't update his Facebook by ...,upset cant update facebook texting might cry r...
2,0,@Kenichan I dived many times for the ball. Man...,kenichan dived many times ball managed save re...
3,0,my whole body feels itchy and like its on fire,whole body feels itchy like fire
4,0,"@nationwideclass no, it's not behaving at all....",nationwideclass behaving im mad cant see


In [14]:
df=df.drop('text',axis=1)

In [15]:
df.head()

Unnamed: 0,target,Clean_text
0,0,switchfoot awww thats bummer shoulda got david...
1,0,upset cant update facebook texting might cry r...
2,0,kenichan dived many times ball managed save re...
3,0,whole body feels itchy like fire
4,0,nationwideclass behaving im mad cant see


In [16]:
x_train,x_test,y_train,y_test=train_test_split(df['Clean_text'],df['target'],test_size=0.25,random_state=42)

In [17]:
vectorizer=TfidfVectorizer(max_features=50000)
x_train_vec=vectorizer.fit_transform(x_train)
x_test_vec=vectorizer.transform(x_test)

In [20]:
model = LogisticRegression(solver='saga', penalty='l2', max_iter=200, C=0.46)
model.fit(x_train_vec, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.46
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'saga'
,max_iter,200


In [21]:
y_pred=model.predict(x_test_vec)

In [22]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.80      0.76      0.78    199581
           4       0.77      0.81      0.79    200419

    accuracy                           0.79    400000
   macro avg       0.79      0.79      0.78    400000
weighted avg       0.79      0.79      0.78    400000



In [23]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression


In [24]:
param_distributions = {
    'C': np.logspace(-3, 3, 10),
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200, 500]
}

In [32]:
random_search=RandomizedSearchCV(
    estimator=model,
    param_distributions=param_distributions,
    n_iter=15,
    scoring='accuracy',
    cv=2,
    n_jobs=-1,
    verbose=2,
    random_state=42
    
)

In [30]:
#random_search.fit(x_train_vec,y_train)

In [34]:
model = LogisticRegression(
    solver='saga',
    penalty='l2',
    C=1.0,
    max_iter=300
)

model.fit(x_train_vec, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'saga'
,max_iter,300


In [40]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
import numpy as np

model = LogisticRegression(solver='saga', penalty='l2', max_iter=200)

param_dist = {
    'C': np.logspace(-3, 1, 10),
    'max_iter': [100, 200, 300]
}

random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=5,       # very small → guaranteed to finish
    cv=2,           # very small → guaranteed to finish
    scoring='accuracy',
    n_jobs=1,       # <-- prevents worker crash
    verbose=2,
    random_state=42
)

print("Running search...")
random_search.fit(x_train_vec, y_train)  # This MUST finish
print("Search finished successfully!")

print("Best Params:", random_search.best_params_)
print("Best Score:", random_search.best_score_)


Running search...
Fitting 2 folds for each of 5 candidates, totalling 10 fits
[CV] END ...............................C=10.0, max_iter=100; total time=  22.6s
[CV] END ...............................C=10.0, max_iter=100; total time=  21.0s
[CV] END .................C=0.1668100537200059, max_iter=100; total time=   7.6s
[CV] END .................C=0.1668100537200059, max_iter=100; total time=   7.2s
[CV] END .................C=1.2915496650148828, max_iter=300; total time=   6.8s
[CV] END .................C=1.2915496650148828, max_iter=300; total time=   8.2s
[CV] END .................C=0.1668100537200059, max_iter=300; total time=   8.2s
[CV] END .................C=0.1668100537200059, max_iter=300; total time=   7.0s
[CV] END ...............C=0.007742636826811269, max_iter=300; total time=   7.1s
[CV] END ...............C=0.007742636826811269, max_iter=300; total time=   7.3s
Search finished successfully!
Best Params: {'max_iter': 300, 'C': np.float64(1.2915496650148828)}
Best Score: 0.

In [41]:
random_search.best_params_

{'max_iter': 300, 'C': np.float64(1.2915496650148828)}

In [42]:
best_nb=random_search.best_estimator_
best_nb

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,np.float64(1.2915496650148828)
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'saga'
,max_iter,300


In [43]:
import pickle

with open("analysis_model.pkl", "wb") as f:
    pickle.dump(best_nb, f)

with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)   
