In [1]:
import pandas as pd

# Load the labelled World Cup tweets; the 'Unnamed: 0' column is used as the row index.
# The path uses forward slashes only: they resolve on Windows, macOS and Linux alike,
# whereas a backslash separator is only understood on Windows.
df = pd.read_csv('../data/fifa_world_cup_2022_tweets.csv', index_col='Unnamed: 0')
df.head()

Unnamed: 0,Date Created,Number of Likes,Source of Tweet,Tweet,Sentiment
0,2022-11-20 23:59:21+00:00,4,Twitter Web App,What are we drinking today @TucanTribe \n@MadB...,neutral
1,2022-11-20 23:59:01+00:00,3,Twitter for iPhone,Amazing @CanadaSoccerEN #WorldCup2022 launch ...,positive
2,2022-11-20 23:58:41+00:00,1,Twitter for iPhone,Worth reading while watching #WorldCup2022 htt...,positive
3,2022-11-20 23:58:33+00:00,1,Twitter Web App,Golden Maknae shinning bright\n\nhttps://t.co/...,positive
4,2022-11-20 23:58:28+00:00,0,Twitter for Android,"If the BBC cares so much about human rights, h...",negative


In [2]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(22524, 5)

In [3]:
# Per-column dtype and non-null count summary, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22524 entries, 0 to 22523
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Date Created     22524 non-null  object
 1   Number of Likes  22524 non-null  int64 
 2   Source of Tweet  22524 non-null  object
 3   Tweet            22524 non-null  object
 4   Sentiment        22524 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.0+ MB


In [4]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

Date Created       0
Number of Likes    0
Source of Tweet    0
Tweet              0
Sentiment          0
dtype: int64

# Data visualization

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,Number of Likes
count,22524.0
mean,23.822856
std,2128.018705
min,0.0
25%,0.0
50%,0.0
75%,2.0
max,316867.0


# Preprocessing

In [6]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import re, string

# Start from NLTK's English stop-word list.
sw = stopwords.words('english')
# Take 'not' out of the list so it is kept when stop words are filtered out
# (presumably to preserve negation for sentiment analysis — confirm with the author).
sw.remove('not')

def remove_stopwords(text):
    """Return a list with the items of ``text`` that are not in ``sw``.

    ``text`` is iterated item by item, so it should be an iterable of tokens;
    a plain string would be filtered character by character. Order is kept.
    """
    kept = []
    for token in text:
        if token in sw:
            continue
        kept.append(token)
    return kept

In [7]:
def clean_tweet(tweet, stop_words=None):
    """Normalise a raw tweet into a space-separated string of lowercase tokens.

    Steps, in order: lowercase, turn newlines into spaces, delete apostrophes,
    strip @mentions, #hashtags and URLs, blank out [bracketed] spans, replace
    every remaining character outside ``a-z0-9`` with a space, then drop
    stop words.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    stop_words : collection of str, optional
        Words to discard. Defaults to the module-level ``sw`` list, so existing
        one-argument callers (e.g. ``CountVectorizer(preprocessor=...)``) are
        unaffected.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ("" if nothing is left).
    """
    if stop_words is None:
        stop_words = sw

    tweet = tweet.lower().replace('\n', ' ')
    # Delete apostrophes outright so contractions collapse into one token
    # ("don't" -> "dont") instead of being split in two further down.
    tweet = re.sub("'", "", tweet)
    tweet = re.sub(r"@[A-Za-z0-9_]+", "", tweet)  # @mentions
    tweet = re.sub(r"#[A-Za-z0-9_]+", "", tweet)  # #hashtags
    tweet = re.sub(r"http\S+", "", tweet)         # URLs
    # Raw string: '\[' in a normal string literal is an invalid escape sequence.
    tweet = re.sub(r"\[.*?\]", " ", tweet)
    # Everything that is not a lowercase letter or digit becomes a space. This
    # already covers "()!?", and split() below absorbs runs of spaces, so no
    # separate passes are needed for either.
    tweet = re.sub(r"[^a-z0-9]", " ", tweet)
    return " ".join(word for word in tweet.split() if word not in stop_words)

In [8]:
# Re-display the first rows: in the cells shown so far clean_tweet has only been
# defined, not applied, so df still holds the raw tweets.
df.head()

Unnamed: 0,Date Created,Number of Likes,Source of Tweet,Tweet,Sentiment
0,2022-11-20 23:59:21+00:00,4,Twitter Web App,What are we drinking today @TucanTribe \n@MadB...,neutral
1,2022-11-20 23:59:01+00:00,3,Twitter for iPhone,Amazing @CanadaSoccerEN #WorldCup2022 launch ...,positive
2,2022-11-20 23:58:41+00:00,1,Twitter for iPhone,Worth reading while watching #WorldCup2022 htt...,positive
3,2022-11-20 23:58:33+00:00,1,Twitter Web App,Golden Maknae shinning bright\n\nhttps://t.co/...,positive
4,2022-11-20 23:58:28+00:00,0,Twitter for Android,"If the BBC cares so much about human rights, h...",negative


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# NOTE: despite its name, `Tfidf` is a plain CountVectorizer (raw term counts, no TF-IDF weighting).
# Each tweet is normalised by clean_tweet and tokenised by NLTK's word_tokenize; terms found in
# fewer than 10 tweets or in more than 90% of tweets are left out of the vocabulary.
# NOTE(review): the vocabulary is fitted on every tweet, i.e. before the train/test split further
# down — confirm whether that leakage is acceptable for this analysis.
Tfidf = CountVectorizer(preprocessor=clean_tweet, tokenizer=word_tokenize, min_df=10, max_df=0.90)
X = Tfidf.fit_transform(df.Tweet)

# toarray() yields a regular ndarray; todense() returns the discouraged numpy.matrix type.
# Sorting the vocabulary keys reproduces the column order of the count matrix.
X_df = pd.DataFrame(X.toarray(), columns=sorted(Tfidf.vocabulary_))
X_df.head()

Unnamed: 0,0,00,000,00pm,1,10,100,1000,10am,10k,...,yet,yo,youll,young,youre,youth,youtube,youve,zakir,zero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Give both frames a fresh 0..n-1 index so the column-wise concat below pairs rows by position.
df.reset_index(drop=True, inplace=True)
X_df.reset_index(drop=True, inplace=True)

# Original tweet columns on the left, one count column per vocabulary term on the right.
df2 = pd.concat((df, X_df), axis="columns")
df2.head()

Unnamed: 0,Date Created,Number of Likes,Source of Tweet,Tweet,Sentiment,0,00,000,00pm,1,...,yet,yo,youll,young,youre,youth,youtube,youve,zakir,zero
0,2022-11-20 23:59:21+00:00,4,Twitter Web App,What are we drinking today @TucanTribe \n@MadB...,neutral,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2022-11-20 23:59:01+00:00,3,Twitter for iPhone,Amazing @CanadaSoccerEN #WorldCup2022 launch ...,positive,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2022-11-20 23:58:41+00:00,1,Twitter for iPhone,Worth reading while watching #WorldCup2022 htt...,positive,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2022-11-20 23:58:33+00:00,1,Twitter Web App,Golden Maknae shinning bright\n\nhttps://t.co/...,positive,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2022-11-20 23:58:28+00:00,0,Twitter for Android,"If the BBC cares so much about human rights, h...",negative,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
from sklearn.model_selection import train_test_split

# Target is the sentiment label; every other column of df2 is a candidate feature.
y = df2['Sentiment']
X = df2.drop(columns='Sentiment')

# Hold out 33% of the rows for evaluation, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [12]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# Encode the sentiment labels as integers; lb_sent is kept to transform y_test later.
lb_sent = LabelEncoder()
y_train = lb_sent.fit_transform(y_train)

# Min-max scale the like counts, fitting on the training split only; mm is kept to
# apply the same scaling to X_test later.
mm = MinMaxScaler()
X_train['Number of Likes'] = mm.fit_transform(
    X_train[['Number of Likes']].to_numpy()
)

# Integer-code the tweet source on the training split.
lb_source = LabelEncoder()
X_train['Source of Tweet'] = lb_source.fit_transform(X_train['Source of Tweet'])

In [13]:
from sklearn.metrics import accuracy_score

In [None]:
# from sklearn.tree import DecisionTreeClassifier

# model = DecisionTreeClassifier()
# model.fit(X_train, y_train)
# pred = model.predict(X_test)
# accuracy_score(y_test, pred)

In [14]:
# Remove, in place, the columns that are not fed to the models from both splits.
# 'Source of Tweet' carried the original note "to be removed" — presumably this drop
# is temporary; confirm with the author.
for column in ('Date Created', 'Source of Tweet', 'Tweet'):
    del X_train[column]
    del X_test[column]


In [15]:
from sklearn.ensemble import RandomForestClassifier

# Fix the seed so the forest is reproducible across runs; 42 matches the random_state
# already used for train_test_split.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


RandomForestClassifier()

In [16]:
# Apply the scaler fitted on the training likes to the test split (transform only, no refit).
X_test['Number of Likes'] = mm.transform(
    X_test[['Number of Likes']].to_numpy()
)


In [17]:

# X_test['Source of Tweet'] = lb_source.transform(X_test['Source of Tweet'])


In [18]:

# Encode the test labels with the encoder fitted on y_train so both splits share one mapping.
# Rebinding y_test means this cell is not safe to run twice on the same kernel.
y_test = lb_sent.transform(y_test)


# Prediction (TODO: `predict_proba`)

In [19]:

# Score the fitted model on the held-out split.
pred = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

0.6609713440064577

In [115]:
from sklearn.linear_model import LogisticRegression

# Rebinds `model` (previously the random forest) to a logistic regression baseline.
# random_state is set because the liblinear solver accepts a seed; 42 matches the seed
# used for the train/test split so reruns give the same score.
model = LogisticRegression(solver='liblinear', random_state=42)
model.fit(X_train, y_train)

# Accuracy on the same held-out split used for the random forest.
pred = model.predict(X_test)
accuracy_score(y_test, pred)

0.6972958428629087