In [1]:
# pip install opendatasets --upgrade --quiet

In [2]:
# import opendatasets as od
# # Download the Twitter sentiment analysis dataset
# dataset_url = 'https://www.kaggle.com/datasets/kazanova/sentiment140'
# od.download(dataset_url)

In [3]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [4]:
import nltk 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aliha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [6]:
dataset = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1')

In [7]:
dataset.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [8]:
col_names = ['target', 'id', 'date', 'flag', 'user', 'text']
dataset.columns = col_names
dataset.head()


Unnamed: 0,target,id,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [9]:
dataset.shape

(1599999, 6)

In [10]:
dataset.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [11]:
dataset['target'].value_counts()

target
4    800000
0    799999
Name: count, dtype: int64

In [12]:
dataset['target'] = dataset['target'].map({4:1, 0:0}) # Convert sentiment labels to binary) 

In [13]:
dataset['target'].value_counts()

target
1    800000
0    799999
Name: count, dtype: int64

In [14]:
stremmer = PorterStemmer()

def stemming(context):
    stemmed_context = re.sub('[^a-zA-Z]', ' ', context)  # Remove non-alphabetic characters
    stemmed_context = stemmed_context.lower()  # Convert to lowercase
    stemmed_context = stemmed_context.split()  # Split into words
    stemmed_context = [stremmer.stem(word) for word in stemmed_context if not word in stopwords.words('english')]  # Remove stopwords and stem
    stemmed_context = ' '.join(stemmed_context)  # Join words back into a single string
    return stemmed_context


In [15]:
# dataset['text'] = dataset['text'].apply(stemming)

In [16]:
dataset.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [17]:
x = dataset['text']
y = dataset['target']

In [18]:
# Split the dataset into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [19]:
#convert textual data into numerical data
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [20]:
print(x_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 15188133 stored elements and shape (1279999, 588958)>
  Coords	Values
  (0, 283678)	0.11963044873134729
  (0, 567458)	0.2117590432241851
  (0, 546444)	0.13110375568900662
  (0, 218982)	0.17560132527703673
  (0, 217891)	0.13830752789131484
  (0, 508644)	0.1865617459839415
  (0, 486053)	0.15429339772493936
  (0, 364589)	0.16075477058257603
  (0, 331512)	0.4827791362717877
  (0, 542863)	0.3941117269212658
  (0, 230550)	0.18237085926222382
  (0, 523010)	0.12969477325997483
  (0, 497114)	0.36145902204060715
  (0, 213371)	0.4691562640762748
  (1, 372613)	0.26682925664941753
  (1, 537250)	0.7375855377011513
  (1, 244120)	0.6202980915414966
  (2, 88575)	0.5163168725911714
  (2, 219086)	0.20861007959696407
  (2, 384515)	0.24859958550655706
  (2, 366620)	0.3956805935259804
  (2, 524038)	0.6866832135425449
  (3, 247783)	0.3993877526950337
  (3, 416279)	0.47938639525150156
  (3, 516131)	0.2951126194653137
  :	:
  (1279996, 200539)	0.365

In [21]:
#train the model
model = LogisticRegression()
model.fit(x_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [22]:
#test the model
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.79903125


In [23]:
#Function to predict sentiment of a new tweet
def predict_sentiment(tweet):
    text = re.sub('[^a-zA-Z]', ' ', tweet)  # Remove non-alphabetic characters
    text = text.lower()  # Convert to lowercase
    text = text.split()  # Split into words
    text = [stremmer.stem(word) for word in text if not word in stopwords.words('english')]  # Remove stopwords and stem
    text = ' '.join(text)  # Join words back into a single string
    text = [text]  # Convert to list
    text = vectorizer.transform(text)  # Convert to numerical data
    prediction = model.predict(text)  # Predict sentiment
    if prediction[0] == 0:
        return "Negative"
    else:
        return "Positive"

In [None]:
#example_tweet = "I love programming!"
print(predict_sentiment("I love programming!"))

Positive


In [31]:
import pickle
pickle.dump(model, open('model.pkl', 'wb'))

In [32]:
pickle.dump(vectorizer, open('vectorizer.pkl', 'wb'))