In [247]:
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [248]:
# TODO: Load the dataset 
df = pd.read_csv('trump_tweet_2016.csv',sep='|',on_bad_lines='skip')

In [249]:
df.info

<bound method DataFrame.info of                    source                                               text  \
0      Twitter for iPhone  RT @realDonaldTrump: Happy Birthday @DonaldJTr...   
1      Twitter for iPhone  Happy Birthday @DonaldJTrumpJr!https://t.co/uR...   
2     Twitter for Android  Happy New Year to all, including to my many en...   
3     Twitter for Android  Russians are playing @CNN and @NBCNews for suc...   
4      Twitter for iPhone  Join @AmerIcan32, founded by Hall of Fame lege...   
...                   ...                                                ...   
3913   Twitter for iPhone      #HappyNewYearAmerica! https://t.co/EeQb8PDrUe   
3914   Twitter for iPhone  HAPPY NEW YEAR &amp; THANK YOU! https://t.co/Y...   
3915  Twitter for Android  I will be on @FoxNews live,  with members of m...   
3916  Twitter for Android  I would like to wish everyone A HAPPY AND HEAL...   
3917  Twitter for Android  Do you believe that The State Department, on N...   

       

In [250]:
df.columns

Index(['source', 'text', 'created_at', 'retweet_count', 'favorite_count',
       'is_retweet', 'id_str'],
      dtype='object')

In [251]:
## target: source
df['source'].unique()
## hypothèse: Android -> Trump; autres -> team
df['target'] = df['source'].apply(lambda x: int('Android' in x))
df['target'].unique()

array([0, 1])

In [252]:
df['target'].value_counts()

0    2228
1    1690
Name: target, dtype: int64

In [253]:
def preprocessing(document):
    # 1- tokenization
    tokens = word_tokenize(document)
    # 2- punctuation removal
    tokens = [t.lower() for t in tokens if t.isalpha()]
    # 3- remove stopwords
    stop_words = stopwords.words('english')
    tokens = [t for t in tokens if not t in stop_words]
    # 4- lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens_lem = [lemmatizer.lemmatize(t) for t in tokens]
    return tokens

In [254]:
## voir comment faire pour éviter le data leakage aussi dans les pré-processing NLP:
## faire le split train / test avant ?
## split data
y = df['target'].to_numpy()
X = df.drop(columns=['source','target'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state = 42, stratify=y)

In [255]:
## faire le preproc NLP sur train et test séparément
features = [col for col in df.columns if col not in ['source','target']]
features
df_train = pd.DataFrame(X_train)
df_train.columns = features

df_test = pd.DataFrame(X_test)
df_test.columns = features

df_test.shape

(784, 6)

In [256]:
df_train.head(5)

Unnamed: 0,text,created_at,retweet_count,favorite_count,is_retweet,id_str
3594,@red77angelluis: @realDonaldTrump @NeilTurner_...,01-25-2016 04:07:08,752,2823,False,691472343449870336
895,"Instead of driving jobs and wealth away, AMERI...",09-27-2016 01:28:04,7648,20743,False,780579728964980736
3621,Failing @GlennBeck lost all credibility. Not o...,01-23-2016 16:54:37,2028,4565,False,690940712578195457
1030,More poll results from last nights Commander-i...,09-08-2016 23:56:22,10284,25291,False,774033670239760384
2028,Have a great Memorial Day and remember that we...,05-30-2016 11:26:47,9159,25545,False,737243856144629760


### TRES IMPORTANT !! VECTORIZATION: fitter le vectorizer uniquement sur le train (pour ne pas avoir de Data Leakage!)
### puis l'appliquer séparément à Train et Test

In [257]:
## Vectorization de TRAIN: voir 3 blocs + bas
"""
vectorizer = CountVectorizer(analyzer=lambda x: x)
vectorizer.fit(X_train)

BOW_train = vectorizer.fit_transform([preprocessing(x) for x in df_train.text]).toarray()
colonnes = vectorizer.get_feature_names_out()

df_bow_train = pd.DataFrame(data=BOW_train, columns=colonnes)  ## être sûr du phasage entre les valeurs et les colonnes
df_bow_train
"""

'\nvectorizer = CountVectorizer(analyzer=lambda x: x)\nvectorizer.fit(X_train)\n\nBOW_train = vectorizer.fit_transform([preprocessing(x) for x in df_train.text]).toarray()\ncolonnes = vectorizer.get_feature_names_out()\n\ndf_bow_train = pd.DataFrame(data=BOW_train, columns=colonnes)  ## être sûr du phasage entre les valeurs et les colonnes\ndf_bow_train\n'

In [258]:
## pas de vectorization de test sur la base de test
## vectorization de test sur la base de la vectorization fittée sur train
"""
vectorizer = CountVectorizer(analyzer=lambda x: x)
BOW_test = vectorizer.fit_transform([preprocessing(x) for x in df_test.text]).toarray()
colonnes = vectorizer.get_feature_names_out()

df_bow_test = pd.DataFrame(data=BOW_test, columns=colonnes)  ## être sûr du phasage entre les valeurs et les colonnes
df_bow_test #784x2515
"""
### !!! PB de DIMENSIONS: Nb features >> Nb Rows !!!!!

'\nvectorizer = CountVectorizer(analyzer=lambda x: x)\nBOW_test = vectorizer.fit_transform([preprocessing(x) for x in df_test.text]).toarray()\ncolonnes = vectorizer.get_feature_names_out()\n\ndf_bow_test = pd.DataFrame(data=BOW_test, columns=colonnes)  ## être sûr du phasage entre les valeurs et les colonnes\ndf_bow_test #784x2515\n'

### !!! PB de DIMENSIONS: Nb features >> Nb Rows !!!!! voir comment faire dans ce cas

In [259]:
y_test.shape

(784,)

In [260]:
## sans distinction sur la target
print(f"BOW in spam case: {df_bow_train.sum(axis=0).sort_values(ascending=False)[:10]}")

BOW in spam case: https              1575
thank               488
great               471
hillary             397
trump               348
amp                 319
realdonaldtrump     318
twitter             280
clinton             240
america             228
dtype: int64


In [261]:
## essai avec les lignes issues du code de V.Malara
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0,stratify=y)
# instanciation du vectorizer
#comprendre pourquoi on utilise dans certains cas : lambda x:x et d'autres la version ci-dessous
vectorizer = CountVectorizer(stop_words='english')

# entraînement du vectorizer sur la base de train uniquement
vectorizer.fit(X_train) ## vectorizer entrainé sur la base de train uniquement pour ne pas avoir de data leakage

# application du vectorizer sur train et test pour la création des BOW sur train et test
X_bow_train = pd.DataFrame(vectorizer.transform(X_train).toarray(), columns=vectorizer.get_feature_names_out())
X_bow_test  = pd.DataFrame(vectorizer.transform(X_test).toarray(), columns=vectorizer.get_feature_names_out())


### CLASSIFICATION

In [262]:
## Perform a logistic regression to predict whether a message is a spam or not
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import AgglomerativeClustering
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,recall_score,precision_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer, KNNImputer

In [263]:
def scores_model(model, X_train, X_test, y_train, y_test):
    metrics = {}

    y_train_pred = model.predict(X_train)
    y_test_pred  = model.predict(X_test)

    ## F1-score
    metrics['f1score_train'] = f1score_train = f1_score(y_train, y_train_pred)
    metrics['f1score_test']  = f1score_test  = f1_score(y_test, y_test_pred)

    ## PRECISION
    metrics['precision_train'] = precision_train = precision_score(y_train,y_train_pred)
    metrics['precision_test']  = precision_test  = precision_score(y_test,y_test_pred)

    ## RECALL
    metrics['recall_train'] = recall_train = recall_score(y_train,y_train_pred)
    metrics['recall_test']  = recall_test  = recall_score(y_test,y_test_pred)

    return metrics

In [264]:
X_bow_train = df_bow_train
X_bow_test = df_bow_test

In [265]:
param_dist = {
    'C': [0.2,0.5],
     }

lr = LogisticRegression()
##grid = RandomizedSearchCV(lr, param_dist, cv=3, n_iter=30, scoring = 'f1')
##grid.fit(X_bow_train, y_train)
##print(f'grid.best_params_: {grid.best_params_}')
print(X_bow_test.head())
print(X_bow_test.shape)
print(X_bow_train.shape)
#y_train_pred = grid.predict(X_bow_train)
#y_test_pred = grid.predict(X_bow_test)
lr.fit(X_bow_train, y_train)
y_test_pred = lr.predict(X_bow_test)
''

dict_scores_models = {}
#dict_scores_lr = scores_model(grid, X_bow_train, X_bow_test, y_train, y_test)
dict_scores_lr = scores_model(lr, X_bow_train, X_bow_test, y_train, y_test)
dict_scores_models['LogisticRegression'] = dict_scores_lr
dict_scores_models

   abandon  abc  abcpolitics  able  abolish  absentee  absolutely  abused  \
0        0    0            0     0        0         0           0       0   
1        0    0            0     0        0         0           0       0   
2        0    0            0     0        0         0           0       0   
3        0    0            0     0        0         0           0       0   
4        0    0            0     0        0         0           0       0   

   abuser  accept  ...  yes  yesterday  yet  york  yorkers  youngstown  youth  \
0       0       0  ...    0          0    0     0        0           0      0   
1       0       0  ...    0          0    0     0        0           0      0   
2       0       0  ...    0          0    1     0        0           0      0   
3       0       0  ...    0          0    0     0        0           0      0   
4       0       0  ...    0          0    0     0        0           0      0   

   youtube  zero  zuckerman  
0        0     0    

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- abused
- abuser
- achieve
- add
- address
- ...
Feature names seen at fit time, yet now missing:
- abandoned
- abdeslam
- abedin
- aberdeen
- ability
- ...
