In [1]:
import pickle
with open('model\\sarcasm_model.sav', 'rb') as f:
    model = pickle.load(f, )
model

RandomForestClassifier()

In [47]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import re, string
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

sw = stopwords.words('english')
sw.remove('not')
lemm = WordNetLemmatizer()

def clean_tweet(tweet):
    
    tweet = tweet.lower()
    tweet = tweet.replace('\n', ' ')
    tweet = re.sub("'", "", tweet) 
    tweet = re.sub("@[A-Za-z0-9_]+","", tweet)
    tweet = re.sub("#[A-Za-z0-9_]+","", tweet)
    tweet = re.sub(r'http\S+', '', tweet)
    tweet = re.sub('[()!?]', ' ', tweet)
    tweet = re.sub('\[.*?\]',' ', tweet)
    tweet = re.sub("[^a-z0-9]"," ", tweet)
    tweet = re.sub(' +', ' ', tweet)
    tweet = tweet.split()
    tweet = [w for w in tweet if not w in sw]
    tweet = " ".join(word for word in tweet)
    return tweet


def text_cleaning(x, lemm: WordNetLemmatizer):
    
    x = x.lower()
    x = re.sub('\s+\n+', ' ', x)
    x = re.sub('[^a-zA-Z0-9]', ' ', x)
    x = x.split()
    
    x = [lemm.lemmatize(word, "v") for word in x if not word in sw]
    x = ' '.join(x)
    
    return x


def make_sarcasm_dataset(path: str, path2: str):
    df  = pd.read_json(path, lines=True)
    df2 = pd.read_json(path2, lines=True)

    df = pd.concat([df, df2])
    return df

#--------------------------------   ------


def preprocessing(df_: pd.DataFrame) -> pd.DataFrame:
    df = df_.copy()
    del df['Unnamed: 0']
    del df['Source of Tweet']
    del df['Date Created']
    del df['Number of Likes']
    
    lb_sent = LabelEncoder()
    df['Sentiment'] = lb_sent.fit_transform(df.Sentiment)
    
    df['clean_tweet'] = df.Tweet.apply(lambda x: clean_tweet(x))

    tfidf = TfidfVectorizer(tokenizer=word_tokenize, min_df=10, max_df=0.90)
    X = tfidf.fit_transform(df.clean_tweet)
    y = df.loc[:, 'Sentiment']
    
    return X, y




def text_preprocessing(df_):
    df = df_.copy()
    
    df['text_clean'] = df.headline.apply(lambda x: text_cleaning(x, lemm))
    df['sentence_length'] = df.text_clean.apply(lambda x: len(x.split()))
    df = df[['headline', 'text_clean','sentence_length','is_sarcastic']]
    
    cv = CountVectorizer(tokenizer=word_tokenize, min_df=10, max_df=0.60, dtype=np.int32)
    X = cv.fit_transform(df.text_clean)

    df3 = add_sparse_matrix_to_dataframe(X, cv.vocabulary_, df)
    
    return df3


def add_sparse_matrix_to_dataframe(sparse_matrix, columns: list[str], df_: pd.DataFrame):
    df = df_.copy()
    
    # sm_df = pd.DataFrame(sm.todense(), columns=columns)
    sm_df = pd.DataFrame(sparse_matrix.toarray(), columns=columns)
    
    df.reset_index(inplace=True, drop=True)
    sm_df.reset_index(inplace=True, drop=True)

    return pd.concat([df, sm_df], axis = 1)



In [48]:

df_tweet = pd.read_csv('.\data\\fifa_world_cup_2022_tweets.csv')
df_tweet = df_tweet[['Tweet','Sentiment']]
df_tweet.columns = ['docs', 'y']
df_tweet['docs'] = df_tweet.docs.apply(lambda x: clean_tweet(x))
lb = LabelEncoder()
df_tweet['y'] = lb.fit_transform(df_tweet.y)
df_tweet.head()

Unnamed: 0,docs,y
0,drinking today,1
1,amazing launch video shows much face canada me...,2
2,worth reading watching,2
3,golden maknae shinning bright,2
4,bbc cares much human rights homosexual rights ...,0


In [49]:
df_sent = make_sarcasm_dataset('.\data\Sarcasm_Headlines_Dataset.json',
                               '.\data\Sarcasm_Headlines_Dataset_v2.json')

df_sent = df_sent[['headline','is_sarcastic']]
df_sent.columns = ['docs', 'y2']
df_sent['docs'] = df_sent.docs.apply(lambda x: text_cleaning(x, lemm))
df_sent.head()

Unnamed: 0,docs,y2
0,former versace store clerk sue secret black co...,0
1,roseanne revival catch thorny political mood b...,0
2,mom start fear son web series closest thing gr...,1
3,boehner want wife listen not come alternative ...,1
4,j k rowling wish snape happy birthday magical way,0


In [50]:
df = pd.concat([df_tweet, df_sent])
df

Unnamed: 0,docs,y,y2
0,drinking today,1.0,
1,amazing launch video shows much face canada me...,2.0,
2,worth reading watching,2.0,
3,golden maknae shinning bright,2.0,
4,bbc cares much human rights homosexual rights ...,0.0,
...,...,...,...
28614,jews celebrate rosh hashasha something,,1.0
28615,internal affairs investigator disappoint consp...,,1.0
28616,beautiful acceptance speech week come queer ko...,,0.0
28617,mar probe destroy orbit spielberg gate space p...,,1.0


In [51]:
def vectorize_col(df, col):
    vec = CountVectorizer(tokenizer=word_tokenize, min_df=10, max_df=0.60, dtype=np.int32)
    
    return add_sparse_matrix_to_dataframe(vec.fit_transform(df[col]), sorted(vec.vocabulary_), df)


In [52]:
%time df = vectorize_col(df, 'docs') # con todense()
df

CPU times: total: 11.2 s
Wall time: 13.7 s


Unnamed: 0,docs,y,y2,0,00,000,00pm,1,10,100,...,z,zakir,zealand,zero,zika,zimmerman,zip,zone,zoo,zuckerberg
0,drinking today,1.0,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,amazing launch video shows much face canada me...,2.0,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,worth reading watching,2.0,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,golden maknae shinning bright,2.0,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,bbc cares much human rights homosexual rights ...,0.0,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77847,jews celebrate rosh hashasha something,,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
77848,internal affairs investigator disappoint consp...,,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
77849,beautiful acceptance speech week come queer ko...,,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
77850,mar probe destroy orbit spielberg gate space p...,,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# %time df = vectorize_col(df, 'docs') # con toarray()
# df

In [53]:
df.drinking

0        1
1        0
2        0
3        0
4        0
        ..
77847    0
77848    0
77849    0
77850    0
77851    0
Name: drinking, Length: 77852, dtype: int32

In [54]:
def add_sentence_length(df_: pd.DataFrame, sentences='docs'):
    df = df_
    df['sentence_length'] = df[sentences].apply(lambda x: len(x.split()))
    sentence_len = df.pop("sentence_length")

    # insert column with insert(location, column_name, column_value)

    df.insert(1, "sentence_length", sentence_len)
    return df

In [55]:
df = add_sentence_length(df, 'docs')
df

Unnamed: 0,docs,sentence_length,y,y2,0,00,000,00pm,1,10,...,z,zakir,zealand,zero,zika,zimmerman,zip,zone,zoo,zuckerberg
0,drinking today,2,1.0,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,amazing launch video shows much face canada me...,27,2.0,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,worth reading watching,3,2.0,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,golden maknae shinning bright,4,2.0,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,bbc cares much human rights homosexual rights ...,20,0.0,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77847,jews celebrate rosh hashasha something,5,,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
77848,internal affairs investigator disappoint consp...,8,,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
77849,beautiful acceptance speech week come queer ko...,7,,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
77850,mar probe destroy orbit spielberg gate space p...,8,,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
df_sent.shape

(55328, 2)

In [57]:
df_tw = df.iloc[:df_tweet.shape[0], :]
df_tw.tail()

Unnamed: 0,docs,sentence_length,y,y2,0,00,000,00pm,1,10,...,z,zakir,zealand,zero,zika,zimmerman,zip,zone,zoo,zuckerberg
22519,go world cup 2022,4,2.0,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22520,anderlecht confirms former viborg ffs jesper f...,15,1.0,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22521,great thread read start,4,2.0,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22522,raphinha wants brazil united,4,2.0,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22523,buy sot pinksale confused buy tokens pinksale ...,18,1.0,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
df_sn = df.iloc[df_tw.shape[0]:, :]
df_sn.head()

Unnamed: 0,docs,sentence_length,y,y2,0,00,000,00pm,1,10,...,z,zakir,zealand,zero,zika,zimmerman,zip,zone,zoo,zuckerberg
22524,former versace store clerk sue secret black co...,10,,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22525,roseanne revival catch thorny political mood b...,8,,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22526,mom start fear son web series closest thing gr...,9,,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22527,boehner want wife listen not come alternative ...,10,,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22528,j k rowling wish snape happy birthday magical way,9,,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
X = df_tw.drop(['docs','y', 'y2'], axis=1)
y = df_tw.y

In [60]:
X2 = df_sn.drop(['docs','y', 'y2'], axis=1)
y2 = df_sn.y2

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.33)

lr_sent = LogisticRegression(solver='liblinear')
lr_sent.fit(X_train, y_train)
pred = lr_sent.predict(X_test)

print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

0.7028117852818512
              precision    recall  f1-score   support

         0.0       0.76      0.66      0.70      1929
         1.0       0.64      0.69      0.66      2715
         2.0       0.74      0.74      0.74      2789

    accuracy                           0.70      7433
   macro avg       0.71      0.70      0.70      7433
weighted avg       0.71      0.70      0.70      7433



In [64]:
X_train, X_test, y_train, y_test = train_test_split(X2, y2, random_state=42, test_size=0.33)

lr_sarc = LogisticRegression(solver='liblinear')
lr_sarc.fit(X_train, y_train)
pred = lr_sarc.predict(X_test)

print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

0.8408456103839203
              precision    recall  f1-score   support

         0.0       0.84      0.87      0.86      9843
         1.0       0.84      0.80      0.82      8416

    accuracy                           0.84     18259
   macro avg       0.84      0.84      0.84     18259
weighted avg       0.84      0.84      0.84     18259



In [66]:
%time lr_sent.fit(X, y)
%time lr_sarc.fit(X2, y2)

CPU times: total: 3.3 s
Wall time: 4.01 s
CPU times: total: 7.41 s
Wall time: 9.71 s


LogisticRegression(solver='liblinear')

In [67]:
pred_proba_sent = lr_sent.predict_proba(X2)
pred_proba_sarc = lr_sarc.predict_proba(X)

In [68]:
pred_proba_sent[:, 0]

array([0.13355952, 0.71686793, 0.1598501 , ..., 0.03081985, 0.47595039,
       0.51919278])

In [69]:
df_pred_sent = pd.DataFrame({'neg':pred_proba_sent[:, 0],'neu':pred_proba_sent[:, 1] , 'pos':pred_proba_sent[:, 2]}) 
print(df_pred_sent.head(), df_pred_sent.shape)                        
df_pred_sarc = pd.DataFrame({'is_S':pred_proba_sarc[:, 0], 'no_S':pred_proba_sarc[:, 1]})
print(df_pred_sarc.head(), df_pred_sarc.shape)                        

        neg       neu       pos
0  0.133560  0.757843  0.108597
1  0.716868  0.015402  0.267730
2  0.159850  0.425142  0.415007
3  0.498141  0.305921  0.195938
4  0.125624  0.006109  0.868267 (55328, 3)
       is_S      no_S
0  0.756618  0.243382
1  0.412793  0.587207
2  0.633365  0.366635
3  0.554803  0.445197
4  0.038537  0.961463 (22524, 2)


In [70]:
df_pred = pd.concat([df_pred_sarc, df_pred_sent])
df_pred

Unnamed: 0,is_S,no_S,neg,neu,pos
0,0.756618,0.243382,,,
1,0.412793,0.587207,,,
2,0.633365,0.366635,,,
3,0.554803,0.445197,,,
4,0.038537,0.961463,,,
...,...,...,...,...,...
55323,,,0.269275,0.084187,0.646538
55324,,,0.093058,0.055818,0.851124
55325,,,0.030820,0.029434,0.939746
55326,,,0.475950,0.387987,0.136063


In [71]:
X = pd.concat([X.reset_index(drop=True), df_pred_sarc.reset_index(drop=True)], axis=1)
X.head()

Unnamed: 0,sentence_length,0,00,000,00pm,1,10,100,1000,10am,...,zealand,zero,zika,zimmerman,zip,zone,zoo,zuckerberg,is_S,no_S
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.756618,0.243382
1,27,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.412793,0.587207
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.633365,0.366635
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.554803,0.445197
4,20,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.038537,0.961463


In [72]:
X2 = pd.concat([X2.reset_index(drop=True), df_pred_sent.reset_index(drop=True)], axis=1)
X2.head()

Unnamed: 0,sentence_length,0,00,000,00pm,1,10,100,1000,10am,...,zero,zika,zimmerman,zip,zone,zoo,zuckerberg,neg,neu,pos
0,10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.13356,0.757843,0.108597
1,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.716868,0.015402,0.26773
2,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.15985,0.425142,0.415007
3,10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.498141,0.305921,0.195938
4,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.125624,0.006109,0.868267


In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.33)

clf_final = LogisticRegression(solver='liblinear')
clf_final.fit(X_train, y_train)
pred = clf_final.predict(X_test)

print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

0.7030808556437509
              precision    recall  f1-score   support

         0.0       0.76      0.66      0.71      1929
         1.0       0.64      0.69      0.66      2715
         2.0       0.74      0.74      0.74      2789

    accuracy                           0.70      7433
   macro avg       0.71      0.70      0.70      7433
weighted avg       0.71      0.70      0.70      7433



In [74]:
X_train, X_test, y_train, y_test = train_test_split(X2, y2, random_state=42, test_size=0.33)

lr = LogisticRegression(solver='liblinear')
lr.fit(X_train, y_train)
pred = lr.predict(X_test)

print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

0.8404622378005367
              precision    recall  f1-score   support

         0.0       0.84      0.87      0.85      9843
         1.0       0.84      0.80      0.82      8416

    accuracy                           0.84     18259
   macro avg       0.84      0.84      0.84     18259
weighted avg       0.84      0.84      0.84     18259



# Conclusioni

Si è potuto constatare che con l'aggiunta degli score dei due modelli, le metriche di valutazione dei modelli non sono cambiate drasticamente. Da ciò possiamo dedurre che il sarcasmo inglese non influisce sul sentimento espresso dal testo, e che dal sentimento del testo non possiamo favorire il riconoscimento del sarcasmo all'intenro dello stesso. Altre riflessioni, i due dataset sono completamente distinti e si riferiscono a due mondi completamente diversi, questo potrebbe aver influito sulle analisi svolte.
