### dataset - Kaggle
## link: https://www.kaggle.com/datasets/kazanova/sentiment140

In [1]:
import pandas as pd    

In [2]:
column =["Target","id","time","flag","user","tweet"]
originaldata = pd.read_csv("twittersentiment.csv",encoding='latin')
originaldata.columns=column

In [3]:
originaldata.head()

Unnamed: 0,Target,id,time,flag,user,tweet
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


## Cleaning

In [4]:
#removing the unwanted columns : id , time, flag,user
data = originaldata
data = data.drop(["id","time","flag","user"],axis=1)
data.head()

Unnamed: 0,Target,tweet
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew


In [5]:
#removing for nan or null values - since our data depends on only the tweet column, is its nan or null, it doesnt provide any info
#hence we are removing
data.isna().sum()

Target    0
tweet     0
dtype: int64

In [6]:
data.isnull().sum()

Target    0
tweet     0
dtype: int64

## Analyze and visualize

In [7]:
# from the website:
# 4- positive
# 2- neutral
# 0- negative
data["Target"].value_counts()

4    800000
0    799999
Name: Target, dtype: int64

In [8]:
#graph

In [9]:
#breaking sentences down to small tokens
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=5000)
#The fit_on_texts() method creates an association between the words and the assigned numbers. 
#This association is stored in the form of a dictionary in the tokenizer.word_index attribute.
tokenizer.fit_on_texts(data['tweet'])

In [10]:
data['tweet'].head()

0    is upset that he can't update his Facebook by ...
1    @Kenichan I dived many times for the ball. Man...
2      my whole body feels itchy and like its on fire 
3    @nationwideclass no, it's not behaving at all....
4                        @Kwesidei not the whole crew 
Name: tweet, dtype: object

In [11]:
#replace the  with the numbers
encoded_docs = tokenizer.texts_to_sequences(data['tweet'])

In [13]:
#making each sentence with equal length
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded_sequence = pad_sequences(encoded_docs, maxlen=200)

In [14]:
print(padded_sequence)

[[   0    0    0 ...   40  273 1170]
 [   0    0    0 ...   37   31   12]
 [   0    0    0 ...   71   13 1169]
 ...
 [   0    0    0 ...   14   11 2107]
 [   0    0    0 ...  501   12   50]
 [   0    0    0 ...    0    0  119]]


## Spliting data

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_sequence,data["Target"], test_size=0.33, random_state=42)

In [16]:
from sklearn.metrics import accuracy_score

## Algorithms
##### 1. Linear Regression
##### 2. Support Vector Machines
##### 3. RNN derivatives LSTM and GRU.

In [17]:
#linear regression
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(X_train, y_train)
prediction = clf.predict(X_test)
print(accuracy_score(y_test,prediction))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5222575757575758


In [None]:
#SVM - linear
from sklearn import svm
lsvm = svm.SVC()
lsvm.fit(X_train, y_train)
prediction = lsvm.predict(X_test)
print(accuracy_score(y_test,prediction))

In [22]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

sgd = Pipeline([
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)


y_pred = sgd.predict(X_test)

print(accuracy_score(y_pred, y_test))

0.4967026515151515
