In [2]:
!pip install torch torchvision torchaudio
!pip install transformers 



In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import torch
from transformers import AutoTokenizer , AutoModel

In [4]:

file_path = "training.1600000.processed.noemoticon.csv"

columns = ['sentiment','id','date','query','user','text']

df = pd.read_csv(file_path,encoding='latin-1',names = columns)

In [5]:
df.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
print(df.shape)

(1600000, 6)


In [7]:
import re

def clean_tweet(text):
    text = str(text).lower()
    text = re.sub(r'@[\w_]+' , ' ' , text)
    text = re.sub(r'#',' ',text)
    text = re.sub(r'http\S+|www.\S+',' ',text)
    text = re.sub(r'\s+',' ',text).strip()
    return text
    

In [8]:
sample_df = df.sample(50000,random_state = 42).copy()
sample_df['clean_text'] = sample_df['text'].apply(clean_tweet)

print(sample_df[['text','clean_text']].head())

                                                     text  \
541200             @chrishasboobs AHHH I HOPE YOUR OK!!!    
750     @misstoriblack cool , i have no tweet apps  fo...   
766711  @TiannaChaos i know  just family drama. its la...   
285055  School email won't open  and I have geography ...   
705995                             upper airways problem    

                                               clean_text  
541200                             ahhh i hope your ok!!!  
750             cool , i have no tweet apps for my razr 2  
766711  i know just family drama. its lame.hey next ti...  
285055  school email won't open and i have geography s...  
705995                              upper airways problem  


In [9]:
sample_df['label'] = sample_df['sentiment'].replace(4,1)

In [10]:
print(sample_df['label'].head())

541200    0
750       0
766711    0
285055    0
705995    0
Name: label, dtype: int64


In [11]:
print(sample_df['label'].tail())

199266    0
210814    0
180674    0
364859    0
172400    0
Name: label, dtype: int64


In [12]:
sample_df['label'].value_counts()

label
1    25014
0    24986
Name: count, dtype: int64

In [13]:
train_text,test_text,train_label,test_label = train_test_split(sample_df['clean_text'].values, sample_df['label'].values,test_size = 0.2,random_state = 42)

In [14]:
xtrain = train_text
xtest = test_text
ytrain = train_label
ytest = test_label

In [15]:
print(len(train_text))
print(len(test_text))

40000
10000


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=10000,stop_words = 'english')

xtrain_tfidf = vectorizer.fit_transform(train_text)
xtest_tfidf = vectorizer.transform(test_text)

In [17]:
print(xtrain_tfidf.shape)
print(xtest_tfidf.shape)

(40000, 10000)
(10000, 10000)


In [18]:
print(vectorizer.get_feature_names_out()[:20])

['00' '000' '00am' '00pm' '01' '02' '04' '05' '06' '07' '08' '09' '10'
 '100' '1000' '100000' '1001' '100th' '101' '102']


In [19]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(max_iter = 200)

lr_model.fit(xtrain_tfidf,train_label)

In [20]:
ypred = lr_model.predict(xtest_tfidf)

In [21]:
from sklearn.metrics import accuracy_score,classification_report
print(accuracy_score(ytest,ypred))
print(classification_report(ytest,ypred,target_names = ['Negitive','Positive']))

0.7494
              precision    recall  f1-score   support

    Negitive       0.76      0.73      0.74      4977
    Positive       0.74      0.77      0.75      5023

    accuracy                           0.75     10000
   macro avg       0.75      0.75      0.75     10000
weighted avg       0.75      0.75      0.75     10000



In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C' : [0.01,0.1,1,10],
    'penalty' : ['l1','l2','elasticnet','none'],
    'solver' : ['liblinear','lbfgs','newton-cg','sag','saga'],
    'max_iter' : [100,200,300],
    'l1_ratio' : [0,0.5,1]
}

grid = GridSearchCV(LogisticRegression(),param_grid,cv = 3,scoring = 'f1',verbose = 2,n_jobs = -1)

grid.fit(xtrain_tfidf,ytrain)

Fitting 3 folds for each of 720 candidates, totalling 2160 fits


In [None]:
print(grid.best_params_)
print(grid.best_score_)

In [None]:
ypred2 = grid.predict(xtest_tfidf)
print(accuracy_score(ytest,ypred2))
print(classification_report(ytest,ypred2,target_names = ['Negitive','Positive']))

In [22]:
from sklearn.svm import LinearSVC

sv_model = LinearSVC()

sv_model.fit(xtrain_tfidf,ytrain)

In [23]:
ypred3 = sv_model.predict(xtest_tfidf)
print(accuracy_score(ytest,ypred3))
print(classification_report(ytest,ypred3,target_names = ['Negitive','Positive']))

0.7297
              precision    recall  f1-score   support

    Negitive       0.73      0.72      0.73      4977
    Positive       0.73      0.74      0.73      5023

    accuracy                           0.73     10000
   macro avg       0.73      0.73      0.73     10000
weighted avg       0.73      0.73      0.73     10000



In [24]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [25]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [26]:
model = Sequential([
    Dense(256, activation='relu', input_shape=(xtrain_tfidf.shape[1],)),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid') 
])


model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy']
)

# Train model
history = model.fit(
    xtrain_tfidf.toarray(), ytrain,
    epochs=5,
    batch_size=512,
    validation_split=0.1,
    verbose=1
)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 143ms/step - accuracy: 0.6891 - loss: 0.5988 - val_accuracy: 0.7325 - val_loss: 0.5245
Epoch 2/5
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 135ms/step - accuracy: 0.7912 - loss: 0.4505 - val_accuracy: 0.7380 - val_loss: 0.5283
Epoch 3/5
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 138ms/step - accuracy: 0.8288 - loss: 0.3891 - val_accuracy: 0.7295 - val_loss: 0.5660
Epoch 4/5
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 128ms/step - accuracy: 0.8617 - loss: 0.3306 - val_accuracy: 0.7190 - val_loss: 0.6190
Epoch 5/5
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 138ms/step - accuracy: 0.8969 - loss: 0.2613 - val_accuracy: 0.7097 - val_loss: 0.7264


In [27]:
y_pred = (model.predict(xtest_tfidf.toarray()) > 0.5).astype("int32")
print("Accuracy:", accuracy_score(ytest, y_pred))
print(classification_report(ytest, y_pred, target_names=['Negative', 'Positive']))

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step
Accuracy: 0.7184
              precision    recall  f1-score   support

    Negative       0.73      0.69      0.71      4977
    Positive       0.71      0.75      0.73      5023

    accuracy                           0.72     10000
   macro avg       0.72      0.72      0.72     10000
weighted avg       0.72      0.72      0.72     10000



In [33]:
sample_tweets = [" I like a girl", "I addicted to mobile"]
cleaned_tweets = [clean_tweet(t) for t in sample_tweets]
sample_tfidf = vectorizer.transform(cleaned_tweets)
predictions = lr_model.predict(sample_tfidf)
for tweet, label in zip(sample_tweets, predictions):
    print(f"{tweet} --> {'Positive' if label == 1 else 'Negative'}")

 I like a girl --> Positive
I addicted to mobile --> Negative


In [37]:
import pickle
with open('vectorizer.pkl','wb') as f:
    pickle.dump(vectorizer,f)

with open('model.pkl','wb') as f:
    pickle.dump(lr_model,f)
    
import dill
with open('clean_tweet.pkl','wb') as f:
    dill.dump(clean_tweet,f)