In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [51]:
TWEET = pd.read_csv(
    '/content/TWITTERtraining.1600000.processed.noemoticon.csv',
    encoding='latin1',
    engine='python',
    quotechar='"',
    doublequote=True,
    on_bad_lines='skip'  # <- skips malformed lines
)


In [52]:
TWEET.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [53]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## DATA PROCESSING

In [54]:
TWEET.shape

(1599999, 6)

In [55]:
columnName = ['target','id','date','flag','user','text']

In [56]:
TWEET = pd.read_csv('/content/TWITTERtraining.1600000.processed.noemoticon.csv',names = columnName, encoding='latin1', engine='python', quotechar='"', doublequote=True, on_bad_lines='skip')

In [57]:
TWEET.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [58]:
TWEET.shape

(1600000, 6)

In [59]:
TWEET.isnull().sum()

Unnamed: 0,0
target,0
id,0
date,0
flag,0
user,0
text,0


In [60]:
TWEET['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
4,800000


In [61]:
TWEET.replace({'target':{4:1}},inplace=True)

In [62]:
TWEET['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
1,800000


In [63]:
PortSteam = PorterStemmer()

In [64]:
def Stemming(content):

  stemmed_content = re.sub('[^a-zA-Z]',' ',content)
  stemmed_content = stemmed_content.lower()
  stemmed_content = stemmed_content.split()
  steemed_content = [PortSteam.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content = ' '.join(stemmed_content)
  return stemmed_content

In [65]:
TWEET['stemmed_data'] = TWEET['text'].apply(Stemming)

In [66]:
TWEET.head()

Unnamed: 0,target,id,date,flag,user,text,stemmed_data
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com y zl awww that s a...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he can t update his facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan i dived many times for the ball manag...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass no it s not behaving at all i ...


In [67]:
x = TWEET['stemmed_data'].values
y = TWEET['target'].values

In [68]:
print(x)
print(y)

['switchfoot http twitpic com y zl awww that s a bummer you shoulda got david carr of third day to do it d'
 'is upset that he can t update his facebook by texting it and might cry as a result school today also blah'
 'kenichan i dived many times for the ball managed to save the rest go out of bounds'
 ... 'are you ready for your mojo makeover ask me for details'
 'happy th birthday to my boo of alll time tupac amaru shakur'
 'happy charitytuesday thenspcc sparkscharity speakinguph h']
[0 0 0 ... 1 1 1]


##  DATA SPLITTING

In [86]:
# 1. Vectorize the text
vectorizer = TfidfVectorizer(max_features=50000)
X_vectorized = vectorizer.fit_transform(x)

# 2. Split the data (preserving class balance)
x_train, x_test, y_train, y_test = train_test_split(
    X_vectorized, y, test_size=0.2, stratify=y, random_state=42
)

# 3. Train the model
model = LogisticRegression()
model.fit(x_train, y_train)

# 4. Evaluate
x_pred_train = model.predict(x_train)
print(classification_report(y_train, x_pred_train))

x_pred_test = model.predict(x_test)
print(classification_report(y_test, x_pred_test))


              precision    recall  f1-score   support

           0       0.82      0.80      0.81    640000
           1       0.80      0.82      0.81    640000

    accuracy                           0.81   1280000
   macro avg       0.81      0.81      0.81   1280000
weighted avg       0.81      0.81      0.81   1280000

              precision    recall  f1-score   support

           0       0.81      0.79      0.80    160000
           1       0.79      0.81      0.80    160000

    accuracy                           0.80    320000
   macro avg       0.80      0.80      0.80    320000
weighted avg       0.80      0.80      0.80    320000



##  SAVING THE MODEL

In [88]:
import pickle

In [89]:
filename = 'TRAINED_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [90]:
Load_model = pickle.load(open('/content/TRAINED_model.sav', 'rb'))

In [95]:
x_new = x_test[200]
print(y_test[200])

prediction = model.predict(x_new)
print(prediction)

if(prediction[0]==0):
  print('Negative')
else:
  print('Positive')

0
[0]
Negative


In [97]:
x_new = x_test[2]
print(y_test[2])

prediction = model.predict(x_new)
print(prediction)

if(prediction[0]==0):
  print('Negative')
else:
  print('Positive')

1
[1]
Positive
