### Predicting Tweet Labels With Multinomial Logistic Regression

In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


In [3]:
Data=pd.read_csv('coop2.csv')
clean_tweets=[]
for tweet in Data.text:
    tweet=re.sub("@[A-Za-z0-9]+", repl=' ',string=tweet)#removes @username/mentions
    tweet=re.sub('[^a-zA-Z]', repl=' ',string=tweet)#removes punctuations + special chars
    tweet=re.sub('(?:(https|http)\s?:\/\/)(\s)*(www\.)?(\s)*((\w|\s)+\.)*([\w\-\s]+\/)*([\w\-]+)((\?)?[\w\s]*=\s*[\w\%&]*)* ',repl=' ',string=tweet)#removes links
    clean_tweets.append(tweet)
    
Data['text']=pd.Series(clean_tweets)

In [5]:
#To remove the suggestions labele that is too small for the model
Suggestions=Data.loc[Data.label =='Suggestion']
Data.drop(Data.index[182],inplace=True)

In [6]:
#Vectorizing adn tokenizing X
Countvectorizer=CountVectorizer()
tweets=Data.text
X=Countvectorizer.fit_transform(tweets).toarray()
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [8]:
y=Data.label.values

In [9]:
#Split data in the ration 70:30
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42,test_size=0.3,stratify=y)

#Fit the model
multinomialNB=MultinomialNB()
multinomialNB.fit(X_train,y_train)

MultinomialNB()

In [10]:
#to predict values
y_pred=multinomialNB.predict(X_test)


#### Multinomial NB Results

In [11]:
#Results
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
accuracy=round(accuracy_score(y_test,y_pred)*100,2)
print(f'Multinomial Naive Bayes accuracy score : {accuracy}%')

Multinomial Naive Bayes accuracy score : 67.36%


In [12]:
y_pred_array=Countvectorizer.fit_transform(y_pred).toarray()
y_test_array=Countvectorizer.fit_transform(y_test).toarray()

#To compare the actual vs predicted values
pd.DataFrame({'Actual_values':y_test,'Predicted_values':y_pred})

Unnamed: 0,Actual_values,Predicted_values
0,Irrelevant,Irrelevant
1,Request,Negative
2,Irrelevant,Irrelevant
3,Positive,Negative
4,Negative,Negative
...,...,...
139,Irrelevant,Request
140,Irrelevant,Inquiry
141,Positive,Negative
142,Irrelevant,Irrelevant


### TWEET CLASSIFICATION WITH SVM

In [13]:
from sklearn.svm import LinearSVC

linearsvc=LinearSVC()
linearsvc.fit(X_train,y_train)

LinearSVC()

In [14]:
#predicting the labels
y_pred2=linearsvc.predict(X_test)

#To compare predicted vs actual labels
#To compare the actual vs predicted values
pd.DataFrame({'Actual_values':y_test,'Predicted_values':y_pred2})

Unnamed: 0,Actual_values,Predicted_values
0,Irrelevant,Irrelevant
1,Request,Request
2,Irrelevant,Irrelevant
3,Positive,Positive
4,Negative,Negative
...,...,...
139,Irrelevant,Request
140,Irrelevant,Irrelevant
141,Positive,Negative
142,Irrelevant,Irrelevant


#### SVM Results

In [16]:
print(accuracy_score(y_test,y_pred2))
classification_report(y_test,y_pred2)

0.6666666666666666


'              precision    recall  f1-score   support\n\n     Inquiry       0.00      0.00      0.00        10\n  Irrelevant       0.64      0.88      0.74        58\n    Negative       0.70      0.54      0.61        52\n    Positive       0.67      0.50      0.57         4\n     Request       0.75      0.75      0.75        20\n\n    accuracy                           0.67       144\n   macro avg       0.55      0.53      0.53       144\nweighted avg       0.63      0.67      0.64       144\n'