<a href="https://www.kaggle.com/code/hassangul/twitter-sentiment-analysis?scriptVersionId=117010191" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

**Liberaries will be used**

In [1]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import cohen_kappa_score
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [2]:
val=pd.read_csv("/kaggle/input/twitter-entity-sentiment-analysis/twitter_validation.csv", header=None)
#Full dataset for Train-Test
train=pd.read_csv("/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv", header=None)

In [3]:
nltk.download("stopwords")
stop_words = [elements.lower() for elements in set(stopwords.words("english"))]

[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


In [4]:
# Giving columns understandible names
train.rename(columns={2:"labels",1:"names",3:"tweets"}, inplace=True)

In [5]:
y = train["labels"]
X = train["tweets"]

In [6]:
X

0        im getting on borderlands and i will murder yo...
1        I am coming to the borders and I will kill you...
2        im getting on borderlands and i will kill you ...
3        im coming on borderlands and i will murder you...
4        im getting on borderlands 2 and i will murder ...
                               ...                        
74677    Just realized that the Windows partition of my...
74678    Just realized that my Mac window partition is ...
74679    Just realized the windows partition of my Mac ...
74680    Just realized between the windows partition of...
74681    Just like the windows partition of my Mac is l...
Name: tweets, Length: 74682, dtype: object

In [7]:
print(len(y))
print(len(X))

74682
74682


In [8]:
y.value_counts()

Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: labels, dtype: int64

In [9]:
# Encoding labels name using label encoder from sklearn.preprocessing
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [10]:
np.unique(y)

array([0, 1, 2, 3])

In [11]:
np.unique(encoder.inverse_transform(y))

array(['Irrelevant', 'Negative', 'Neutral', 'Positive'], dtype=object)

In [12]:
# Lowering text (tweets) and  replacing any extra/special/digits from tweets using regex
X = X.str.lower()
X.replace("[^a-zA-Z]"," ", regex=True, inplace=True)
X.head()

0    im getting on borderlands and i will murder yo...
1    i am coming to the borders and i will kill you...
2    im getting on borderlands and i will kill you ...
3    im coming on borderlands and i will murder you...
4    im getting on borderlands   and i will murder ...
Name: tweets, dtype: object

In [13]:
# Removing stop words from the tweets
def remove_stop (x):
    return ",".join([words for words in str(x).split() if words not in stop_words])

X = X.apply(lambda x: remove_stop(x))
X.head()

0    im,getting,borderlands,murder
1              coming,borders,kill
2      im,getting,borderlands,kill
3     im,coming,borderlands,murder
4    im,getting,borderlands,murder
Name: tweets, dtype: object

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,y,
    test_size=0.2,shuffle = True, random_state = 42)

In [15]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(59745,)
(14937,)
(59745,)
(14937,)


In [16]:
# creating array/list of the tweets (train and test)
x_train_array = []
for row in range(0, len(X_train.index)):
    x_train_array.append("".join(x for x in X_train.iloc[row] ))
    
    
x_test_array = []
for row in range (0, len(X_test.index)):
    x_test_array.append("".join(x for x in X_test.iloc[row]))

In [17]:
# converting arrays of tweets into vector form
tfidfvector = TfidfVectorizer()
tfidf_train_dataset = tfidfvector.fit_transform(x_train_array)
tfidf_test_dataset = tfidfvector.transform(x_test_array)

# Logistic Regression Model

In [18]:
# Model decleration
lr_model = LR(C=10.0, max_iter=100,
                penalty='l2',
                   random_state=10, solver='liblinear' )

# training the model
lr_model.fit(tfidf_train_dataset,y_train)

# testing the model
predictions = lr_model.predict(tfidf_test_dataset)

In [19]:
matrix=confusion_matrix(y_test,predictions)
print(matrix)
score=accuracy_score(y_test,predictions)
print(score)
report=classification_report(y_test,predictions)
print(report)
kappa = cohen_kappa_score(y_test, predictions)
print(kappa)

[[2073  134  139  246]
 [  60 3991  172  296]
 [  77  200 3034  285]
 [  93  196  191 3750]]
0.8601459463078261
              precision    recall  f1-score   support

           0       0.90      0.80      0.85      2592
           1       0.88      0.88      0.88      4519
           2       0.86      0.84      0.85      3596
           3       0.82      0.89      0.85      4230

    accuracy                           0.86     14937
   macro avg       0.87      0.85      0.86     14937
weighted avg       0.86      0.86      0.86     14937

0.8104727481308267


# Testing the Model

In [20]:
test =  ["Its ok eluno musk given every one a good space to talk","I am ok whatever people will say about anyone","It depends on peoples mind set how they talk either dirty or good mind","pakistans politicians are not doing good job in their country"]
tes = tfidfvector.transform(test)

pre = lr_model.predict(tes)


In [21]:
print(encoder.inverse_transform(pre))

['Positive' 'Negative' 'Irrelevant' 'Neutral']


In [22]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
forest_model = RandomForestClassifier()
forest_model.fit(tfidf_train_dataset, y_train)

prediction = forest_model.predict(tfidf_test_dataset)

matrix=confusion_matrix(y_test,predictions)
print(matrix)
score=accuracy_score(y_test,predictions)
print(score)
report=classification_report(y_test,predictions)
print(report)
kappa = cohen_kappa_score(y_test, predictions)
print(kappa)

[[2073  134  139  246]
 [  60 3991  172  296]
 [  77  200 3034  285]
 [  93  196  191 3750]]
0.8601459463078261
              precision    recall  f1-score   support

           0       0.90      0.80      0.85      2592
           1       0.88      0.88      0.88      4519
           2       0.86      0.84      0.85      3596
           3       0.82      0.89      0.85      4230

    accuracy                           0.86     14937
   macro avg       0.87      0.85      0.86     14937
weighted avg       0.86      0.86      0.86     14937

0.8104727481308267


> Open for suggestions, Looking forward to you guys 😊