In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, FunctionTransformer
from sklearn.impute import SimpleImputer, KNNImputer
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, LancasterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

nltk.download("stopwords")
nltk.download("wordnet")
pd.set_option('display.max_colwidth', None)


[nltk_data] Downloading package stopwords to /home/kf/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/kf/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
df = pd.read_csv("./Tweets.csv")

In [14]:
colums_to_drop = [
    "tweet_id",
    "negativereason",
    "airline_sentiment_gold",
    "name",
    "negativereason_gold",
    "tweet_coord",
    "tweet_created",
    "tweet_location",
]

In [15]:
df = df[["text", "airline_sentiment"]]#.drop(colums_to_drop, axis=1)

### Remplacer les valeurs de la variable "airline sentiment" par des entiers égaux à 0, 1 et 2. 

In [16]:
df["target"] = df["airline_sentiment"].replace( {"positive": 2, "neutral": 1, "negative": 0} )

In [17]:
df = df.drop("airline_sentiment", axis = 1)

### Séparer le dataset en un jeu d'entraînement/validation et un jeu de test.

In [18]:
X = df.drop("target", axis=1)
y = df["target"]

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y, random_state=314)

In [20]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(11712, 1)
(2928, 1)
(11712,)
(2928,)


### Prétraitement de la variable "text"

In [21]:
    
def clean_text(text):
    res = text
    
    res = res.lower()
    
    # Suppression des "@..."
    res = re.sub("@\S+", "", res)
    
    # Suppression des urls
    res = re.sub("http[^\s]+|www\S+", "", res)
    
    # Suppression des #
    res = res.replace("#", "")
    
    # Suppression des nombres
    res = re.sub("\d+", "", res)
    
    # Suppression des "stop words" ("the", "in", "a"...)
    res = [word.strip() for word in res.split(" ") if word not in stopwords.words("english")]
    
    res = " ".join(res)
    
    # Suppression des caractères spéciaux
    # À faire AVANT la lemmatisation et racinisation
    for c in "#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~":
        res = res.replace(c, "")
    
    res = res.split(" ")
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    res = [lemmatizer.lemmatize(word) for word in res]

    # Racinisation
    stemmer = LancasterStemmer()
    res = [stemmer.stem(word) for word in res]
    
    res = " ".join(res)

    return res

def clean_text_2(X):
    ct = np.vectorize(clean_text)
    return ct(X)

In [22]:
text_pipeline = Pipeline(steps = [
    ("clean", FunctionTransformer(clean_text_2, feature_names_out = "one-to-one")),
    ("tfidf", TfidfVectorizer())
])

In [23]:
preprocessor = ColumnTransformer(transformers=[
    ("text", text_pipeline, "text"),
    
], remainder="passthrough", verbose=True)


In [24]:
preprocessor.fit(X_train)

[ColumnTransformer] .......... (1 of 1) Processing text, total=  28.2s


In [34]:
pipeline = Pipeline(steps = [
    ("preprocessor", preprocessor),
    ("rf", RandomForestClassifier())
])

In [35]:
%%time
pipeline.fit(X_train, y_train)

[ColumnTransformer] .......... (1 of 1) Processing text, total=  25.7s
CPU times: user 38.3 s, sys: 2.52 s, total: 40.8 s
Wall time: 42.6 s


In [47]:
sentences = [
    "Just touched down after an amazing flight! ✈️ The views from above never cease to amaze me. #FlyingHigh",
    "Neutral flight experience today. Smooth journey, but nothing particularly noteworthy. #Travel",
    "Feeling refreshed after a peaceful flight. Thankful for the opportunity to relax above the clouds. ☁️✈️ #TravelGoals",
    "Disappointed with the lack of legroom on this flight. Can barely stretch out! 😕 #TallPeopleProblems",
    "Another successful flight in the books! Thankful for the attentive cabin crew and smooth landing. #TravelTuesday",
    "Neutral vibes on this flight. Can't complain, can't rave. Just cruising at 30,000 feet. #FlyLife",
    "Flight delayed again? Come on, airline! You're killing my schedule here. 😒 #TravelWoes",
    "Positively surprised by the in-flight entertainment options. Made the journey fly by! 🎬✈️ #MovieBuff",
    "Mediocre service on this flight. Could definitely use some improvement in the hospitality department. #CustomerExperience",
    "That breathtaking sunrise view from the plane window made waking up early totally worth it. 🌅✈️ #MorningFlight",
    "Feeling a bit queasy after some turbulence on the flight. Hope it smooths out soon! 🤢 #MotionSickness",
    "Just landed in my favorite city after a smooth flight! Let the adventures begin. 🌇✈️ #Wanderlust",
    "Neutral flight experience - neither remarkable nor disappointing. Just another day in the skies. #TravelLife",
    "Missed my connection due to a delayed flight. Not the smooth travel day I was hoping for. 😩 #TravelProblems",
    "Absolutely thrilled with the upgraded seat on this flight! Talk about flying in style. 😎✈️ #LuxuryTravel",
    "Neutral flight today, but the sunset from above the clouds was breathtaking. 🌇✈️ #SkyViews",
    "Negative experience with lost luggage on this flight. Hoping the airline can sort it out ASAP. 😤 #LostAndFound",
    "Positive vibes only on this flight! Grateful for the opportunity to explore new destinations. #AdventureTime",
    "Disappointed by the lack of vegetarian meal options on this flight. Come on, airline, it's 2024! 🥗✈️ #DietaryNeeds",
    "Smooth takeoff, smooth landing. Can't ask for much more from a flight! #TravelBliss",
]

In [48]:
df_sentences = pd.DataFrame({"text": sentences})

In [59]:
y_pred = pipeline.predict(X_test)

In [61]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.93      0.85      1835
           1       0.64      0.42      0.51       620
           2       0.76      0.57      0.65       473

    accuracy                           0.76      2928
   macro avg       0.73      0.64      0.67      2928
weighted avg       0.75      0.76      0.75      2928



In [51]:
sentiments = {0: "negative",
              1: "neutre",
              2: "positive"
             }

In [56]:
for i in range(len(sentences)):
    print(sentences[i])
    print(f"Cette phrase semble {sentiments[y_pred[i]]}.")
    print()

Just touched down after an amazing flight! ✈️ The views from above never cease to amaze me. #FlyingHigh
Cette phrase semble negative.

Neutral flight experience today. Smooth journey, but nothing particularly noteworthy. #Travel
Cette phrase semble negative.

Feeling refreshed after a peaceful flight. Thankful for the opportunity to relax above the clouds. ☁️✈️ #TravelGoals
Cette phrase semble positive.

Disappointed with the lack of legroom on this flight. Can barely stretch out! 😕 #TallPeopleProblems
Cette phrase semble negative.

Another successful flight in the books! Thankful for the attentive cabin crew and smooth landing. #TravelTuesday
Cette phrase semble positive.

Neutral vibes on this flight. Can't complain, can't rave. Just cruising at 30,000 feet. #FlyLife
Cette phrase semble negative.

Flight delayed again? Come on, airline! You're killing my schedule here. 😒 #TravelWoes
Cette phrase semble negative.

Positively surprised by the in-flight entertainment options. Made the j