## Lecture CSV

In [52]:
import pandas as pd
import plotly.express as px

In [53]:
# Read the training set; the path is relative to the notebook's working directory.
data_train = pd.read_csv(filepath_or_buffer="../datas/train.csv")
# Preview the first five rows as the cell output.
data_train.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


---

## Préparation, Nettoyage, EDA

Basic stats

In [54]:
# Quick health check of the raw frame: size, preview, summary statistics, missing values.

# Row count, followed by one blank line.
print(f"Nombre de lignes : {len(data_train)}", end="\n\n")

# First rows, rendered through the notebook's rich display.
print("Affichage du dataset : ")
display(data_train.head())
print()

# Summary statistics over every column, numeric or not (include="all").
print("Statistiques basiques : ")
data_desc = data_train.describe(include="all")
display(data_desc)
print()

# Missing values per column: absolute counts first...
print("Valeurs manquantes : ")
display(data_train.isna().sum())

# ...then the same counts expressed as a percentage of all rows.
print("Pourcentage de valeurs manquantes : ")
display(data_train.isna().sum() * 100 / len(data_train))

Nombre de lignes : 7613

Affichage du dataset : 


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1



Statistiques basiques : 


Unnamed: 0,id,keyword,location,text,target
count,7613.0,7552,5080,7613,7613.0
unique,,221,3341,7503,
top,,fatalities,USA,11-Year-Old Boy Charged With Manslaughter of T...,
freq,,45,104,10,
mean,5441.934848,,,,0.42966
std,3137.11609,,,,0.49506
min,1.0,,,,0.0
25%,2734.0,,,,0.0
50%,5408.0,,,,0.0
75%,8146.0,,,,1.0



Valeurs manquantes : 


id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

Pourcentage de valeurs manquantes : 


id           0.000000
keyword      0.801261
location    33.272035
text         0.000000
target       0.000000
dtype: float64

In [55]:
# Print the column dtypes, non-null counts and memory usage of the loaded frame.
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


---

Distribution des catastrophes réelles et non réelles

In [56]:
# Build a relabelled copy for display only, so the numeric target in data_train is untouched.
data_plot = data_train.assign(
    target=data_train["target"].map({0: "Pas de catastrophe", 1: "Catastrophe"})
)

# Histogram of the number of tweets in each class.
fig = px.histogram(
    data_plot,
    x="target",
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
fig.update_layout(
    title="Répartition de la réalité des catastrophes",
    title_x=0.5,  # centre the title horizontally
    xaxis_title="Cible",
    yaxis_title="Nombre de tweets",
)
fig.show()

---

In [57]:
# Schema summary of data_train again; no cell since the earlier info() call has modified it.
# NOTE(review): this repeats the earlier data_train.info() cell — consider removing the duplicate.
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


---

Suppression de la colonne id

In [58]:
# Remove the `id` column from the working frame.
# errors="ignore" keeps this cell idempotent: data_train is rebound here, so a second
# execution would otherwise raise KeyError because "id" is already gone.
data_train = data_train.drop(columns=["id"], errors="ignore")
# Preview the result as the cell output.
data_train.head()

Unnamed: 0,keyword,location,text,target
0,,,Our Deeds are the Reason of this #earthquake M...,1
1,,,Forest fire near La Ronge Sask. Canada,1
2,,,All residents asked to 'shelter in place' are ...,1
3,,,"13,000 people receive #wildfires evacuation or...",1
4,,,Just got sent this photo from Ruby #Alaska as ...,1


---

keyword

In [59]:
# Ten most frequent keywords (value_counts sorts by descending count and leaves out NaN by default).
# NOTE(review): some values appear URL-encoded (e.g. "body%20bags") — consider decoding them before modelling.
data_train["keyword"].value_counts().head(10)


keyword
fatalities     45
deluge         42
armageddon     42
damage         41
body%20bags    41
harm           41
sinking        41
evacuate       40
outbreak       40
fear           40
Name: count, dtype: int64

---

location

In [60]:
# Twenty most frequent raw location values, to see how they are spelled before any cleanup.
data_train["location"].value_counts().head(20)

location
USA                104
New York            71
United States       50
London              45
Canada              29
Nigeria             28
UK                  27
Los Angeles, CA     26
India               24
Mumbai              22
Washington, DC      21
Kenya               20
Worldwide           19
Chicago, IL         18
Australia           18
California          17
New York, NY        15
California, USA     15
Everywhere          15
San Francisco       14
Name: count, dtype: int64

Garder uniquement ce qu'il y a avant la virgule

In [61]:
# Keep only the text before the first comma (e.g. "Los Angeles, CA" -> "Los Angeles").
# Missing locations stay missing: the .str accessor propagates NaN.
data_train["location"] = (
    data_train["location"]
    .str.split(",", n=1)  # at most one split: [before first comma, remainder]
    .str.get(0)           # first piece of each split
)

In [62]:
# Same top-20 view after truncating at the first comma, to check the effect of the cleanup.
# NOTE(review): aliases such as "USA" / "United States" are still counted separately — confirm whether they should be merged.
data_train["location"].value_counts().head(20)

location
USA              108
New York          95
London            70
United States     50
Washington        45
Los Angeles       45
California        38
Chicago           36
Nigeria           30
Canada            29
UK                29
Mumbai            27
San Francisco     25
India             24
Seattle           22
Calgary           21
Toronto           21
Kenya             20
Atlanta           20
Worldwide         19
Name: count, dtype: int64

---

target : renommer en "labels" pour le ML

In [63]:
# Rename the target column to "labels"; rename() leaves the frame unchanged if "target" is absent.
data_train = data_train.rename({"target": "labels"}, axis="columns")

---

Répartition des variables qualitatives

In [64]:
# Columns treated as qualitative: those with object or boolean dtype.
cat_features = data_train.select_dtypes(include=["object", "bool"]).columns

for feature in cat_features:
    # Occurrence table for this column, limited to its 40 most frequent values
    # (value_counts sorts by descending count).
    df_grouped = (
        data_train[feature]
        .value_counts()
        .reset_index()
        .head(40)
        .set_axis([feature, "count"], axis=1)  # force stable column names
    )

    # One bar chart per qualitative column.
    fig = px.bar(
        df_grouped,
        x=feature,
        y="count",
        color_discrete_sequence=px.colors.qualitative.Pastel,
    )
    fig.update_layout(
        title=f"Répartition de {feature}",
        title_x=0.5,  # centre the title horizontally
        xaxis_title=feature,
        yaxis_title="Nombre d'occurences",
    )
    fig.show()

---

Sauvegarde du fichier préprocessé

In [65]:
# Write the cleaned frame next to the raw file; index=False leaves the row index out of the CSV.
data_train.to_csv(path_or_buf="../datas/train_clean.csv", index=False)