In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import random

In [60]:
df = pd.read_csv("twitter_new.csv", encoding='latin-1', header=None, names=['target', 'ids', 'date', 'flag', 'user', 'text'])

In [61]:
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [62]:
df['target'].value_counts()

0    800000
4    800000
Name: target, dtype: int64

In [63]:
relevant_columns = ['target', 'text']
filtered_dataset = df[relevant_columns]
filtered_dataset.head()

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [64]:
X = filtered_dataset['text']
y = filtered_dataset['target']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [65]:
vectorizer = TfidfVectorizer(max_features=20000)

In [66]:
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.fit_transform(X_test)

In [67]:
X_train

<1280000x20000 sparse matrix of type '<class 'numpy.float64'>'
	with 14037570 stored elements in Compressed Sparse Row format>

In [68]:
X_test

<320000x20000 sparse matrix of type '<class 'numpy.float64'>'
	with 3516558 stored elements in Compressed Sparse Row format>

# Logistic Regression

In [69]:
classifier = LogisticRegression(max_iter=1000)

In [70]:
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)

In [71]:
accuracy = accuracy_score(y_test,y_pred)


In [72]:
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.55


# Vader

In [73]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer


In [74]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Gokul\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [75]:
analyzer = SentimentIntensityAnalyzer()

In [76]:
def get_sentiment(score):
    if score > 0 :
        return 4
    else:
        return 0

In [77]:
df['vader_score'] = df['text'].apply(lambda x : analyzer.polarity_scores(x)['compound'])

In [78]:
df['vader_sentiment'] = df['vader_score'].apply(get_sentiment)

In [79]:
accuracy = accuracy_score(df['target'], df['vader_sentiment'])
print(f"Vader sentiment accuracy: {accuracy:.2f}")

Vader sentiment accuracy: 0.65


# TextBlob

In [80]:
from textblob import TextBlob

In [81]:
text = "mercedesashley Damn! The grind is inspirational and saddening at the same time.  Don't want you to stop cuz I like what u do! Much love"

polarity  = TextBlob(text).sentiment.polarity
polarity

0.3333333333333333

In [82]:
new_df = filtered_dataset

In [83]:
new_df

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,4,Just woke up. Having no school is the best fee...
1599996,4,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...


In [84]:
new_df['sentiment'] = df['text'].apply(lambda x : TextBlob(x).sentiment.polarity)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [85]:
new_df['transformed_sentiment'] = new_df['sentiment'].apply(get_sentiment)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [86]:
accuracy = accuracy_score(df['target'], new_df['transformed_sentiment'])
print(f"TextBlob sentiment accuracy: {accuracy:.2f}")

TextBlob sentiment accuracy: 0.62


In [87]:
new_df

Unnamed: 0,target,text,sentiment,transformed_sentiment
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0.216667,4
1,0,is upset that he can't update his Facebook by ...,0.000000,0
2,0,@Kenichan I dived many times for the ball. Man...,0.500000,4
3,0,my whole body feels itchy and like its on fire,0.200000,4
4,0,"@nationwideclass no, it's not behaving at all....",-0.625000,0
...,...,...,...,...
1599995,4,Just woke up. Having no school is the best fee...,1.000000,4
1599996,4,TheWDB.com - Very cool to hear old Walt interv...,0.290000,4
1599997,4,Are you ready for your MoJo Makeover? Ask me f...,0.200000,4
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...,1.000000,4


# roBERTa


In [88]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax

In [89]:
data_frame = df[['target', 'text']]
data_frame

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,4,Just woke up. Having no school is the best fee...
1599996,4,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...


In [90]:
#model
roberta = "cardiffnlp/twitter-roberta-base-sentiment"

In [91]:
model = AutoModelForSequenceClassification.from_pretrained(roberta)

In [92]:
tokenizer = AutoTokenizer.from_pretrained(roberta)

In [93]:
def analysis(tweet):
    tweet_words = []
    for word in tweet.split(' '):
        if word.startswith('@') and len(word)>1:
            word = '@user'
        elif word.startswith('http'):
            word = 'http'
        tweet_words.append(word)
    tweet_processed = ' '.join(tweet_words)
    encoded_tweet = tokenizer(tweet_processed, return_tensors='pt')
    output = model(**encoded_tweet)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    if (scores[0]>scores[2]):
        return 0
    elif (scores[0]<scores[2]):
        return 4
    else:
        return 2

In [94]:
X = data_frame['text']
y = data_frame['target']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [95]:
X_test = (pd.DataFrame(X_test))

In [96]:
y_pred = df['text'].head(1000).apply(lambda x : analysis(x))

In [97]:
accuracy = accuracy_score(y_pred, df['target'].head(1000))
print(f"roBERTa sentiment accuracy: {accuracy:.2f}")

roBERTa sentiment accuracy: 0.77


# Random Forest Classifier


In [3]:
from nltk.tokenize import RegexpTokenizer
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english', tokenizer=token.tokenize)

In [7]:
cv

CountVectorizer(stop_words='english',
                tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>)

In [16]:
text_counts = cv.fit_transform(reduced_df['text'])

In [55]:
def reduce_dataset(original_df):
    reduced_dataset = original_df.sample(n=200000)
    return reduced_dataset
reduced_df = reduce_dataset(df)

In [56]:
text_counts = cv.fit_transform(reduced_df['text'])

In [57]:
X_train,X_test, y_train, y_test = train_test_split(text_counts, reduced_df['target'], test_size=0.25, random_state=42)

In [58]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

from sklearn import metrics

predicted = dt.predict(X_test)
accuracy = metrics.accuracy_score(predicted,y_test)
accuracy

0.6936

In [38]:
X_train

<750x3194 sparse matrix of type '<class 'numpy.int64'>'
	with 5580 stored elements in Compressed Sparse Row format>