In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, FunctionTransformer
from sklearn.impute import SimpleImputer, KNNImputer
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, LancasterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK resources used further down: the stop-word corpus and WordNet.
nltk.download("stopwords")
nltk.download("wordnet")
# Do not truncate long cell values (such as tweet text) when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)
# Counter starting at 1 -- presumably meant to be passed to `afficher`, which returns its
# counter argument incremented by one (not used elsewhere in the visible cells).
cpt = 1

[nltk_data] Downloading package stopwords to /home/kf/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/kf/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the raw tweets dataset from the CSV file in the notebook's directory.
df = pd.read_csv("./Tweets.csv")

In [3]:
# Class distribution of the sentiment label.
df["airline_sentiment"].value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

In [4]:
# Display the frame to eyeball the raw columns (the rendering shows only the first and last rows).
df

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0000,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0000,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials to the experience... tacky.,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I need to take another trip!,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0000,Bad Flight,0.7033,Virgin America,,jnardino,,0,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse",,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0000,Can't Tell,1.0000,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing about it,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14635,569587686496825344,positive,0.3487,,0.0000,American,,KristenReenders,,0,@AmericanAir thank you we got on a different flight to Chicago.,,2015-02-22 12:01:01 -0800,,
14636,569587371693355008,negative,1.0000,Customer Service Issue,1.0000,American,,itsropes,,0,@AmericanAir leaving over 20 minutes Late Flight. No warnings or communication until we were 15 minutes Late Flight. That's called shitty customer svc,,2015-02-22 11:59:46 -0800,Texas,
14637,569587242672398336,neutral,1.0000,,,American,,sanyabun,,0,@AmericanAir Please bring American Airlines to #BlackBerry10,,2015-02-22 11:59:15 -0800,"Nigeria,lagos",
14638,569587188687634433,negative,1.0000,Customer Service Issue,0.6659,American,,SraJackson,,0,"@AmericanAir you have my money, you change my flight, and don't answer your phones! Any other suggestions so I can make my commitment??",,2015-02-22 11:59:02 -0800,New Jersey,Eastern Time (US & Canada)


In [5]:
# (number of rows, number of columns) of the raw dataset.
df.shape

(14640, 15)

In [6]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      14640 non-null  int64  
 1   airline_sentiment             14640 non-null  object 
 2   airline_sentiment_confidence  14640 non-null  float64
 3   negativereason                9178 non-null   object 
 4   negativereason_confidence     10522 non-null  float64
 5   airline                       14640 non-null  object 
 6   airline_sentiment_gold        40 non-null     object 
 7   name                          14640 non-null  object 
 8   negativereason_gold           32 non-null     object 
 9   retweet_count                 14640 non-null  int64  
 10  text                          14640 non-null  object 
 11  tweet_coord                   1019 non-null   object 
 12  tweet_created                 14640 non-null  object 
 13  t

## Analyse

In [7]:
# Number of missing values in each column.
df.isna().sum()

tweet_id                            0
airline_sentiment                   0
airline_sentiment_confidence        0
negativereason                   5462
negativereason_confidence        4118
airline                             0
airline_sentiment_gold          14600
name                                0
negativereason_gold             14608
retweet_count                       0
text                                0
tweet_coord                     13621
tweet_created                       0
tweet_location                   4733
user_timezone                    4820
dtype: int64

In [8]:
# Distribution of the retweet counts.
df["retweet_count"].value_counts()

0     13873
1       640
2        66
3        22
4        17
5         5
7         3
6         3
22        2
8         1
32        1
28        1
9         1
18        1
11        1
31        1
15        1
44        1
Name: retweet_count, dtype: int64

In [9]:
# Frequency of each labelled reason attached to negative tweets.
df["negativereason"].value_counts()

Customer Service Issue         2910
Late Flight                    1665
Can't Tell                     1190
Cancelled Flight                847
Lost Luggage                    724
Bad Flight                      580
Flight Booking Problems         529
Flight Attendant Complaints     481
longlines                       178
Damaged Luggage                  74
Name: negativereason, dtype: int64

In [10]:
# Print the first 100 raw tweets to see what the text cleaning has to deal with
# (the output shows @mentions, URLs and HTML entities such as "&amp;").
for t in df["text"].head(100):
    print(t)

@VirginAmerica What @dhepburn said.
@VirginAmerica plus you've added commercials to the experience... tacky.
@VirginAmerica I didn't today... Must mean I need to take another trip!
@VirginAmerica it's really aggressive to blast obnoxious "entertainment" in your guests' faces &amp; they have little recourse
@VirginAmerica and it's a really big bad thing about it
@VirginAmerica seriously would pay $30 a flight for seats that didn't have this playing.
it's really the only bad thing about flying VA
@VirginAmerica yes, nearly every time I fly VX this “ear worm” won’t go away :)
@VirginAmerica Really missed a prime opportunity for Men Without Hats parody, there. https://t.co/mWpG7grEZP
@virginamerica Well, I didn't…but NOW I DO! :-D
@VirginAmerica it was amazing, and arrived an hour early. You're too good to me.
@VirginAmerica did you know that suicide is the second leading cause of death among teens 10-24
@VirginAmerica I &lt;3 pretty graphics. so much better than minimal iconography. :D
@V

In [11]:
# Frequency of each user time zone.
df["user_timezone"].value_counts()

Eastern Time (US & Canada)    3744
Central Time (US & Canada)    1931
Pacific Time (US & Canada)    1208
Quito                          738
Atlantic Time (Canada)         497
                              ... 
Warsaw                           1
Bucharest                        1
Wellington                       1
Sarajevo                         1
Saskatchewan                     1
Name: user_timezone, Length: 85, dtype: int64

In [12]:
# Frequency of each free-text tweet location.
df["tweet_location"].value_counts()

Boston, MA              157
New York, NY            156
Washington, DC          150
New York                127
USA                     126
                       ... 
Seattle (duh!)            1
South ~O-H-I-O~ Side      1
Pocono Raceway            1
denver, co                1
Nigeria,lagos             1
Name: tweet_location, Length: 3081, dtype: int64

In [13]:
# Inspect the creation timestamps (the output shows they are stored with dtype object).
df["tweet_created"]

0        2015-02-24 11:35:52 -0800
1        2015-02-24 11:15:59 -0800
2        2015-02-24 11:15:48 -0800
3        2015-02-24 11:15:36 -0800
4        2015-02-24 11:14:45 -0800
                   ...            
14635    2015-02-22 12:01:01 -0800
14636    2015-02-22 11:59:46 -0800
14637    2015-02-22 11:59:15 -0800
14638    2015-02-22 11:59:02 -0800
14639    2015-02-22 11:58:51 -0800
Name: tweet_created, Length: 14640, dtype: object

In [14]:
# List the column names.
df.columns

Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'],
      dtype='object')

In [15]:
# Frequency of each tweet location (same query as the earlier tweet_location cell).
df["tweet_location"].value_counts()

Boston, MA              157
New York, NY            156
Washington, DC          150
New York                127
USA                     126
                       ... 
Seattle (duh!)            1
South ~O-H-I-O~ Side      1
Pocono Raceway            1
denver, co                1
Nigeria,lagos             1
Name: tweet_location, Length: 3081, dtype: int64

In [16]:
# Number of tweets per user name.
df["name"].value_counts()

JetBlueNews       63
kbosspotter       32
_mhertz           29
otisday           28
throthra          27
                  ..
ChrisJLeary        1
tracy_edes         1
NGottesman         1
chadlacalamita     1
sanyabun           1
Name: name, Length: 7701, dtype: int64

In [17]:
# Frequency of each user time zone (same query as the earlier user_timezone cell).
df["user_timezone"].value_counts()

Eastern Time (US & Canada)    3744
Central Time (US & Canada)    1931
Pacific Time (US & Canada)    1208
Quito                          738
Atlantic Time (Canada)         497
                              ... 
Warsaw                           1
Bucharest                        1
Wellington                       1
Sarajevo                         1
Saskatchewan                     1
Name: user_timezone, Length: 85, dtype: int64

In [18]:
# Columns removed from the dataset before modelling.
# NOTE(review): "colums" is a typo for "columns"; the name is kept as-is because the
# following cell refers to it.
colums_to_drop = [
    "tweet_id",
    "negativereason",
    "airline_sentiment_gold",
    "name",
    "negativereason_gold",
    "tweet_coord",
    "tweet_created",
    "tweet_location",
]

In [19]:
# Remove the columns listed in `colums_to_drop` from the frame.
df = df.drop(columns=colums_to_drop)

### Remplacer les valeurs de la variable "airline_sentiment" par des entiers : 0 (negative), 1 (neutral) et 2 (positive).

In [20]:
# Encode the sentiment label as an integer target: negative -> 0, neutral -> 1, positive -> 2.
# `map` is used instead of `replace`: `replace` silently leaves any label missing from the
# mapping as a string (giving a mixed-type column) and relies on object -> int downcasting,
# whereas `map` turns an unmapped label into NaN, which the check below reports immediately.
SENTIMENT_TO_TARGET = {"negative": 0, "neutral": 1, "positive": 2}
df["target"] = df["airline_sentiment"].map(SENTIMENT_TO_TARGET)
assert df["target"].notna().all(), "airline_sentiment contains a label missing from SENTIMENT_TO_TARGET"

In [21]:
# The textual label is now redundant with `target`, so remove it from the frame.
df = df.drop(columns="airline_sentiment")

### Séparer le dataset en un jeu d'entraînement/validation et un jeu de test.

In [22]:
# Separate the label vector from the feature columns.
y = df["target"]
X = df.drop(columns=["target"])

In [23]:
# Hold out 20% of the rows as the test set. The split is shuffled with a fixed seed (reproducible)
# and stratified on the target, so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y, random_state=314)

In [24]:
# Sanity-check the sizes of the four parts of the split, one shape per line.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(11712, 6)
(2928, 6)
(11712,)
(2928,)


### Prétraitement des variables non textuelles

In [25]:
# Column groups, one per preprocessing pipeline.
# NOTE(review): in the rows displayed earlier, negativereason_confidence is missing for neutral
# tweets, 0.0 for positive ones and > 0 for negative ones, so this feature may leak the target --
# confirm before modelling.
num_variables  = ["airline_sentiment_confidence", "negativereason_confidence", "retweet_count"]
cat_variables  = ["airline", "user_timezone"]
text_variables = ["text"]

In [26]:
# Numeric features: fill missing values from the 10 nearest neighbours, then min-max scale.
num_pipeline = Pipeline(steps = [
    ("imputation", KNNImputer(n_neighbors=10)),
    ("scaler", MinMaxScaler())
])

# Categorical features: fill missing values with the most frequent category, then one-hot encode
# (dense output, first category of each feature dropped, categories unseen during fit ignored).
cat_pipeline = Pipeline(steps = [
    ("imputation", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(sparse_output=False, handle_unknown="ignore", drop="first"))
])

### Prétraitement de la variable "text"

In [27]:
def afficher(x, cpt):
    """Print ``x`` and return the counter ``cpt`` advanced by one.

    Args:
        x: Any value; it is written to standard output with ``print``.
        cpt: Current counter value.

    Returns:
        ``cpt + 1``.
    """
    print(x)
    next_cpt = cpt + 1
    return next_cpt
    
# Translation table deleting the special characters stripped from every token.
# NOTE(review): unlike string.punctuation, this set contains neither "!" nor '"'; it is kept
# exactly as in the original -- confirm whether that omission is intended.
_SPECIAL_CHARS_TABLE = str.maketrans("", "", "#$%&'()*+,-./:;<=>?@[\\]^_`{|}~")

# Lazily-filled cache for the NLTK resources, so they are built once per kernel instead of
# once per tweet (or, for the stop-word list, once per word).
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return ``(stop_words, lemmatizer, stemmer)``, building them on first use."""
    if "tools" not in _TEXT_TOOLS:
        _TEXT_TOOLS["tools"] = (
            frozenset(stopwords.words("english")),
            WordNetLemmatizer(),
            LancasterStemmer(),
        )
    return _TEXT_TOOLS["tools"]


def clean_text(text):
    """Normalise one tweet into a space-separated string of stemmed tokens.

    Steps, in order: lower-casing; removal of @mentions, URLs, '#' signs and digits;
    removal of English stop words; removal of special characters; lemmatization
    (WordNet) and stemming (Lancaster).

    Changes compared with the previous implementation:
      * the stop-word list is loaded once and kept as a frozenset, instead of calling
        ``stopwords.words("english")`` and scanning the resulting list for every word;
      * the regular expressions are raw strings, so ``\\S`` and ``\\d`` are no longer
        invalid string escape sequences;
      * tokens are split on any whitespace (not only on a single space), so words separated
        by newlines or tabs are filtered and stemmed individually, and empty tokens are
        dropped instead of producing runs of spaces.

    Args:
        text: The raw tweet (a string).

    Returns:
        The cleaned tweet, tokens joined by single spaces (possibly an empty string).
    """
    stop_words, lemmatizer, stemmer = _get_text_tools()

    res = text.lower()

    # Remove "@..." mentions
    res = re.sub(r"@\S+", "", res)

    # Remove URLs
    res = re.sub(r"http\S+|www\S+", "", res)

    # Remove '#' signs (the hashtag word itself is kept)
    res = res.replace("#", "")

    # Remove numbers
    res = re.sub(r"\d+", "", res)

    # Remove stop words ("the", "in", "a"...). This runs before the special characters are
    # stripped, as in the original, so tokens are compared with their apostrophes intact.
    words = [word for word in res.split() if word not in stop_words]

    # Strip special characters; must happen BEFORE lemmatization and stemming.
    words = [word.translate(_SPECIAL_CHARS_TABLE) for word in words]

    # Lemmatize then stem each remaining token; tokens emptied by the stripping are skipped.
    stems = [stemmer.stem(lemmatizer.lemmatize(word)) for word in words if word]

    return " ".join(stems)

def clean_text_2(X):
    """Apply ``clean_text`` element-wise to an array-like of strings.

    Args:
        X: Array-like of raw tweets (for example the "text" column).

    Returns:
        A NumPy string array with the same shape as ``X`` holding the cleaned tweets.
    """
    # The output type is given explicitly: without `otypes`, np.vectorize must call
    # clean_text on the first element just to infer it (so that tweet is cleaned twice)
    # and raises ValueError when X is empty.
    ct = np.vectorize(clean_text, otypes=[str])
    return ct(X)

On peut éviter l'utilisation du Pipeline en traitant directement la colonne "text" ainsi:
```
X_train["text_clean"] = X_train["text"].apply(clean_text)
X_train = X_train.drop("text", axis=1)
X_test["text_clean"] = X_test["text"].apply(clean_text)
X_test = X_test.drop("text", axis=1)
```

Création du Pipeline de prétraitement du texte.  
On ajoute `feature_names_out = "one-to-one"` pour pouvoir récupérer les noms des colonnes après pré-traitement.

In [28]:
# Text branch: clean every tweet with clean_text_2, then vectorize the cleaned text with TF-IDF.
text_pipeline = Pipeline(steps = [
    ("clean", FunctionTransformer(clean_text_2, feature_names_out = "one-to-one")),
    ("tfidf", TfidfVectorizer())
])

In [29]:
# Route each column group to its pipeline; columns not listed are passed through unchanged.
# The text column is selected with the scalar string "text" (not the `text_variables` list), so
# the text pipeline receives a 1-D column of strings rather than a 2-D frame.
preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_variables),
    ("cat", cat_pipeline, cat_variables),
    ("text", text_pipeline, "text"),
    
], remainder="passthrough", verbose=True)

Pour demander au ColumnTransformer de retourner un DataFrame plutôt qu'un np.array :
```
preprocessor.set_output(transform="pandas")
```
Mais ici, FunctionTransformer n'a pas de méthode set_output implémentée. Il faut donc le faire à la main en récupérant le nom des colonnes transformées.

In [30]:
# Fit every preprocessing step on the training set only.
preprocessor.fit(X_train)

[ColumnTransformer] ........... (1 of 3) Processing num, total=   2.2s
[ColumnTransformer] ........... (2 of 3) Processing cat, total=   0.0s
[ColumnTransformer] .......... (3 of 3) Processing text, total=  30.3s


In [35]:
# Apply the fitted preprocessing to both splits (the test set is transformed, never fitted on).
X_train_clean = preprocessor.transform(X_train)
X_test_clean = preprocessor.transform(X_test)



In [37]:
# Wrap the transformed matrices in DataFrames labelled with the generated feature names.
# - The original row index is kept so the frames stay aligned with y_train / y_test, which
#   still carry the shuffled index produced by train_test_split.
# - toarray() yields a plain ndarray, whereas todense() returns the legacy np.matrix type.
# - The feature names are computed once and shared by both frames.
feature_names = preprocessor.get_feature_names_out()
X_train_df = pd.DataFrame(X_train_clean.toarray(), index=X_train.index, columns=feature_names)
X_test_df = pd.DataFrame(X_test_clean.toarray(), index=X_test.index, columns=feature_names)

In [42]:
# Save the preprocessed splits as pickles rather than CSV: .to_csv takes a long time on a
# DataFrame containing sparse columns.
# NOTE(review): the frames saved here are built from a densified matrix, so the sparse-column
# remark may date from an earlier variant of the notebook -- confirm.
X_train_df.to_pickle("./X_train.pkl")
X_test_df.to_pickle("./X_test.pkl")
y_train.to_pickle("./y_train.pkl")
y_test.to_pickle("./y_test.pkl")