In [12]:
import pickle
from pathlib import Path

# Build the path from its components so it resolves on every OS; the previous
# literal 'model\\sarcasm_model.sav' only points into the model/ directory on
# Windows.
MODEL_PATH = Path('model') / 'sarcasm_model.sav'

# SECURITY: pickle.load can execute arbitrary code stored in the file, so only
# load model files that come from a trusted source.
with MODEL_PATH.open('rb') as f:
    model = pickle.load(f)
model

In [15]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import re, string
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


def clean_tweet(tweet, stop_words=None):
    """Normalise a raw tweet into a space-separated string of kept tokens.

    Steps, in order: lower-case the text; turn newlines into spaces; drop
    apostrophes; remove @mentions, #hashtags and http(s) links; remove
    [bracketed] spans; replace every character outside a-z / 0-9 with a
    space; split on whitespace; discard stop words.

    Args:
        tweet: Raw tweet text as a ``str``.
        stop_words: Optional container of words to drop from the cleaned
            tweet. When omitted, the notebook-level ``sw`` collection is
            used, as before.

    Returns:
        The cleaned tweet as a single string (possibly empty).
    """
    if stop_words is None:
        # NOTE(review): `sw` is not defined in any cell visible here; it must
        # exist in the kernel before this function is called without an
        # explicit `stop_words` argument.
        stop_words = sw

    text = tweet.lower().replace('\n', ' ')
    text = re.sub(r"'", "", text)
    text = re.sub(r"@[A-Za-z0-9_]+", "", text)
    text = re.sub(r"#[A-Za-z0-9_]+", "", text)
    text = re.sub(r"http\S+", "", text)
    # Raw string: the previous non-raw '\[.*?\]' relied on an invalid escape
    # sequence, which newer Python versions warn about.
    text = re.sub(r"\[.*?\]", " ", text)
    # This single pass also blanks the '()!?' characters that used to be
    # handled by a separate substitution.
    text = re.sub(r"[^a-z0-9]", " ", text)
    # split() already collapses runs of spaces, so no extra ' +' pass is needed.
    return " ".join(word for word in text.split() if word not in stop_words)

def make_tweets_dataset(path: str) -> tuple[np.ndarray, np.ndarray]:
    """Read the tweets CSV at ``path`` and turn it into features and labels.

    The file is loaded with ``pd.read_csv`` and handed to ``preprocessing``;
    the pair that function produces is returned unchanged.
    """
    # NOTE(review): `preprocessing` builds its first result with
    # TfidfVectorizer.fit_transform and its second from a DataFrame column, so
    # the ndarray annotation above looks inaccurate — confirm before relying
    # on it.
    tweets_frame = pd.read_csv(path)
    features, labels = preprocessing(tweets_frame)
    return features, labels

def preprocessing(df_: pd.DataFrame, vectorizer=None) -> tuple[object, pd.Series]:
    """Turn the raw tweets frame into a TF-IDF feature matrix and label vector.

    Args:
        df_: Frame with at least the ``Tweet`` and ``Sentiment`` columns. It
            is not modified; the work is done on a copy.
        vectorizer: Optional ``TfidfVectorizer``. If it has already been
            fitted (it carries a ``vocabulary_`` attribute) the tweets are
            only transformed, so the columns match the feature space an
            existing model was trained on. If it is unfitted it is fitted
            here, and the caller keeps the reference for later reuse or
            persistence. When omitted, a new vectorizer with the original
            settings is created and fitted.

    Returns:
        ``(X, y)``: ``X`` is the TF-IDF matrix produced by the vectorizer and
        ``y`` is the label-encoded ``Sentiment`` column as a Series.
    """
    # Only these two columns are used. Selecting them replaces four `del`
    # statements that raised KeyError whenever an unused column was missing.
    df = df_[['Tweet', 'Sentiment']].copy()

    df['Sentiment'] = LabelEncoder().fit_transform(df['Sentiment'])
    df['clean_tweet'] = df['Tweet'].apply(clean_tweet)

    if vectorizer is None:
        vectorizer = TfidfVectorizer(tokenizer=word_tokenize, min_df=10, max_df=0.90)

    if hasattr(vectorizer, 'vocabulary_'):
        # Already fitted: reuse its vocabulary so the feature count stays
        # compatible with whatever model was trained alongside it.
        X = vectorizer.transform(df['clean_tweet'])
    else:
        # NOTE: fitting on the whole frame before any train/test split lets
        # test-set vocabulary and IDF statistics influence the training
        # features; pass a vectorizer fitted on the training rows to avoid it.
        X = vectorizer.fit_transform(df['clean_tweet'])

    y = df.loc[:, 'Sentiment']
    return X, y


In [1]:

# Forward slashes resolve on Windows and POSIX alike; the previous literal
# '.\data\\...' was Windows-only and contained the invalid escape '\d'.
X, y = make_tweets_dataset('./data/fifa_world_cup_2022_tweets.csv')

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Seed the forest as well as the split so the reported metrics are
# reproducible from one run to the next.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

0.6794672586015539
              precision    recall  f1-score   support

           0       0.71      0.61      0.66      1149
           1       0.62      0.70      0.66      1648
           2       0.73      0.71      0.72      1708

    accuracy                           0.68      4505
   macro avg       0.69      0.67      0.68      4505
weighted avg       0.68      0.68      0.68      4505



In [18]:
# Per-class probability estimates from the forest trained in this notebook
# (`clf`) for the held-out split `X_test`; displayed as the cell's output.
clf.predict_proba(X_test)

array([[0.12      , 0.77      , 0.11      ],
       [0.38106996, 0.19477366, 0.42415638],
       [0.182     , 0.428     , 0.39      ],
       ...,
       [0.72      , 0.16      , 0.12      ],
       [0.13      , 0.38      , 0.49      ],
       [0.54      , 0.41      , 0.05      ]])

In [19]:
# NOTE(review): this call fails with the ValueError shown in the cell output.
# `X_test` was vectorised by the TfidfVectorizer fitted inside `preprocessing`
# during this session, while the unpickled `model` expects a different feature
# space (the error reports 2481 vs 5766 features). To score these tweets with
# `model`, they must be transformed by the vectorizer that was fitted when
# `model` was trained — TODO: locate or persist that vectorizer next to the
# pickle.
model.predict_proba(X_test)



ValueError: X has 2481 features, but RandomForestClassifier is expecting 5766 features as input.