In [None]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd 
import nltk
from nltk.corpus import stopwords
# for stemming 
import re
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# run this if you have problems with nltk instalation 
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

In [None]:
# upload this datset from page https://www.kaggle.com/datasets/kazanova/sentiment140
# and give it name "sentiments.csv"
data = pd.read_csv("sentiments.csv", encoding='ISO-8859-1')

In [None]:
data

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
...,...,...,...,...,...,...
1599994,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599995,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599996,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599997,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [None]:
# columns names changing 

data.columns = ['label' if col == '0' else col for col in data.columns]
# 0 - negative emotional color
# 2 - neutral emotional color
# 4 = positive color


data.columns = ['id' if col == '1467810369' else col for col in data.columns]
data.columns = ['date' if col == 'Mon Apr 06 22:19:45 PDT 2009' else col for col in data.columns]
data.columns = ['query' if col == 'NO_QUERY' else col for col in data.columns]
data.columns = ['nickname' if col == '_TheSpecialOne_' else col for col in data.columns]

# It was not possible to change the name of the column with text - we use another method

In [None]:
# data output one more time 
data

Unnamed: 0,label,id,date,query,nickname,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
...,...,...,...,...,...,...
1599994,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599995,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599996,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599997,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [None]:
#change the name of the text column using its serial number (5)

columns_names = data.columns.tolist()

# Changing the column name by number
column_number_to_change = 5
# define new column name
new_column_name = 'text'
columns_names[column_number_to_change] = new_column_name

# Setting new column names
data.columns = columns_names

# Printing the DataFrame to make sure the column name has changed
print(data)

         label          id                          date     query  \
0            0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
1            0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
2            0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
3            0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4            0  1467811372  Mon Apr 06 22:20:00 PDT 2009  NO_QUERY   
...        ...         ...                           ...       ...   
1599994      4  2193601966  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599995      4  2193601969  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599996      4  2193601991  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599997      4  2193602064  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599998      4  2193602129  Tue Jun 16 08:40:50 PDT 2009  NO_QUERY   

                nickname                                               text  
0          scotthamilton  is upset that he can't update his Facebook by ...  
1  

In [None]:
# mix the data

# since the data is divided into three large parts according to their labels,
# such that the first few hundred thousand rows are labeled 0, the next are labeled two,
# and so on. This number would be too large for training.
# Therefore, so that the training set contains data with all three types of labels, we will mix them
data = data.sample(frac=1, random_state=42)


In [None]:
# output shuffled data
data

Unnamed: 0,label,id,date,query,nickname,text
541200,0,2200003313,Tue Jun 16 18:18:13 PDT 2009,NO_QUERY,DEWGetMeTho77,@Nkluvr4eva My poor little dumpling In Holmde...
750,0,1467998601,Mon Apr 06 23:11:18 PDT 2009,NO_QUERY,Young_J,I'm off too bed. I gotta wake up hella early t...
766711,0,2300049112,Tue Jun 23 13:40:12 PDT 2009,NO_QUERY,dougnawoschik,I havent been able to listen to it yet My spe...
285055,0,1993474319,Mon Jun 01 10:26:09 PDT 2009,NO_QUERY,thireven,now remembers why solving a relatively big equ...
705995,0,2256551006,Sat Jun 20 12:56:51 PDT 2009,NO_QUERY,taracollins086,"Ate too much, feel sick"
...,...,...,...,...,...,...
259178,0,1985362137,Sun May 31 16:57:40 PDT 2009,NO_QUERY,LasVegasEstate,I didn't. The link was wrong
1414414,4,2057029865,Sat Jun 06 12:14:24 PDT 2009,NO_QUERY,iwantsmore,@tommcfly yes!! mcfly twitter profile is an ex...
131932,0,1835640087,Mon May 18 06:26:26 PDT 2009,NO_QUERY,CosmicJase,@SarahFTW I know sometimes i just pretend i h...
671155,0,2246780236,Fri Jun 19 18:06:46 PDT 2009,NO_QUERY,AnnaCaverly,Cant believe you came and asked me that...


In [None]:
# the number of rows is too large - 1599999, I suggest taking only the first 20,000 of them
# but since the label column variables take on values ​​of 2 and 4 much later, we can take 15,000 tweets for each
#labelbla - 0, 2, 4

# or, in order not to find out starting from which tweet label takes values ​​2 and 4, we can simply
# shuffle the data (as done above)

# this code leaves only the first 20,000 lines in the data variable
data = data.iloc[:20000]



In [None]:
# output first 20.000 rows of our data
data

Unnamed: 0,label,id,date,query,nickname,text
541200,0,2200003313,Tue Jun 16 18:18:13 PDT 2009,NO_QUERY,DEWGetMeTho77,@Nkluvr4eva My poor little dumpling In Holmde...
750,0,1467998601,Mon Apr 06 23:11:18 PDT 2009,NO_QUERY,Young_J,I'm off too bed. I gotta wake up hella early t...
766711,0,2300049112,Tue Jun 23 13:40:12 PDT 2009,NO_QUERY,dougnawoschik,I havent been able to listen to it yet My spe...
285055,0,1993474319,Mon Jun 01 10:26:09 PDT 2009,NO_QUERY,thireven,now remembers why solving a relatively big equ...
705995,0,2256551006,Sat Jun 20 12:56:51 PDT 2009,NO_QUERY,taracollins086,"Ate too much, feel sick"
...,...,...,...,...,...,...
1374482,4,2051447103,Fri Jun 05 22:02:36 PDT 2009,NO_QUERY,_Jaska,@girlwonder24 Thanks.
667014,0,2245469948,Fri Jun 19 16:10:39 PDT 2009,NO_QUERY,julianicolao,"trying to study for the biggest test, next wee..."
1451234,4,2063022808,Sun Jun 07 01:05:46 PDT 2009,NO_QUERY,ElaineToni,Just finished watching Your Song Presents: Boy...
1181412,4,1982082859,Sun May 31 10:29:36 PDT 2009,NO_QUERY,lindseyrd20,@janfran813 awww i can't wait to get one


In [None]:
# check for empty values
data.isnull().sum()

label       0
id          0
date        0
query       0
nickname    0
text        0
dtype: int64

In [None]:
# stopwords downloading from nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/maxkucher/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# stopwords printing
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
# to predict the emotional coloring of a tweet, we combine columns with user names and text

data["features"] = data["nickname"] + " " + data["text"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["features"] = data["nickname"] + " " + data["text"]


In [None]:
# let's pass the ["features"] data through the stemming function
stem = PorterStemmer()


In [None]:
def stemming(features):
    stemmed_data = stemmed_data = re.sub('[^a-zA-z]', ' ', features) 
    stemmed_data = stemmed_data.lower()
    stemmed_data = stemmed_data.split()
    stemmed_data = [stem.stem(word) for word in stemmed_data if not word in stopwords.words("english")]
    stemmed_data = ' '.join(stemmed_data)
    return stemmed_data

In [None]:
# apply stemming function to each feature
data["features"] = data["features"].apply(stemming)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["features"] = data["features"].apply(stemming)


In [None]:
# stemmed data output
print(data["features"])

541200     dewgetmetho nkluvr eva poor littl dumpl holmde...
750         young_j bed gotta wake hella earli tomorrow morn
766711      dougnawoschik havent abl listen yet speaker bust
285055     thireven rememb solv rel big equat two unknown...
705995                         taracollin ate much feel sick
                                 ...                        
1374482                                _jaska girlwond thank
667014     julianicolao tri studi biggest test next week ...
1451234        elainetoni finish watch song present boystown
1181412                  lindseyrd janfran awww wait get one
517910                                      serraannisa noth
Name: features, Length: 20000, dtype: object


In [None]:
x = data['features'].values
y = data['label'].values

In [None]:
print(x)
print(y)

['dewgetmetho nkluvr eva poor littl dumpl holmdel vid realli tri hope dont tri hard tonight xx'
 'young_j bed gotta wake hella earli tomorrow morn'
 'dougnawoschik havent abl listen yet speaker bust' ...
 'elainetoni finish watch song present boystown'
 'lindseyrd janfran awww wait get one' 'serraannisa noth']
[0 0 0 ... 4 4 0]


In [None]:
x.shape


(20000,)

In [None]:
y.shape

(20000,)

In [None]:
# convert the text data of the array x into numeric ones# 
vect = TfidfVectorizer()
vect.fit(x)

x = vect.transform(x)

In [None]:
print(x)

  (0, 41856)	0.20951986262935746
  (0, 40162)	0.2584097240008915
  (0, 38833)	0.34103274913907555
  (0, 38516)	0.17103535142636683
  (0, 31211)	0.15133143187662332
  (0, 29869)	0.2063492987956557
  (0, 27493)	0.33965620990185424
  (0, 22299)	0.18109710292023012
  (0, 16053)	0.15678883100621377
  (0, 15946)	0.33965620990185424
  (0, 15154)	0.19809009623286153
  (0, 11976)	0.3091751189879487
  (0, 10775)	0.3261681123005801
  (0, 10380)	0.1832912436474271
  (0, 9744)	0.33965620990185424
  (1, 42244)	0.5320485547157268
  (1, 40476)	0.34531660786790025
  (1, 38479)	0.2541683538962912
  (1, 25841)	0.2514703679850329
  (1, 15513)	0.4386825812918135
  (1, 14465)	0.3241884122824399
  (1, 10899)	0.3106656622721904
  (1, 3704)	0.2752965494817516
  (2, 42138)	0.28046060334419215
  (2, 35360)	0.43733472844417515
  :	:
  (19995, 102)	0.6781611625370226
  (19996, 41367)	0.19913220123162295
  (19996, 40710)	0.26058034559354337
  (19996, 38833)	0.25978966011489957
  (19996, 37918)	0.22489554936265002
 

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
# using logistic regression model for multiple classification task 
model = LogisticRegression()

model.fit(x_train, y_train)

In [None]:
model.score(x_train, y_train)

0.875

In [None]:
# model accuracy is too low, we need set some parameters to make the model better
model.score(x_test, y_test)

0.73175

In [None]:
# hyperparameter tuning
from sklearn.model_selection import GridSearchCV

param_grid = {
    'penalty': ['l1', 'l2'],           # Regularization type (L1 or L2)
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Reverse Strength of Regularization
    'solver': ['liblinear', 'lbfgs'] # Optimization algorithm
}
# LogisticRegression() is set to estimator by default
grid_search = GridSearchCV(LogisticRegression(),param_grid, cv=5, scoring='accuracy')


# cv=5: This parameter indicates the number of blocks (folds)
# into which your training dataset will be split during cross-validation.
# In this case, 5-fold cross-validation is used, which means
# that the data will be divided into 5 blocks and the training/testing process will be repeated 5 times,
# each time using one of the blocks as a test data set and the remaining blocks
# as a training dataset.

# scoring='accuracy': This parameter defines the metric used to evaluate the quality of the model. In this case, the accuracy metric is used,
# which measures the proportion of correct predictions made by the model. A classification task can use various metrics,
# such as precision, recall, F1-score and others, depending on the specific requirements of the task.

In [None]:
# fitting the model by test data to find the best parameters
grid_search.fit(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
30 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the fa

In [None]:
# best parameters
grid_search.best_estimator_

In [None]:
# assign the best parameters to the model
model = grid_search.best_estimator_

In [474]:
# accuracy has increased significantly
model.score(x_test, y_test)

0.93

In [479]:
# model testing (works well)
n = 3

predicted_value = x_test[n]
prediction = model.predict(predicted_value)

print(f"Predicted value: {prediction}")
print(f"Actual value: {y_test[n]}")

Predicted value: [4]
Actual value: 4
