# Dataset & Libraries 

In [None]:
import plotly.graph_objects as go
from wordcloud import WordCloud
import pandas as pd
import matplotlib.pyplot as plt
import re 
import nltk
nltk.download('stopwords')
from tqdm import tqdm
import unicodedata

In [None]:
# Load the raw inputs. The train/test descriptions and the training labels
# are re-indexed by their 'Id' column; the category table keeps its default index.
train_df = pd.read_json("train.json")
train_df = train_df.set_index('Id')

test_df = pd.read_json("test.json")
test_df = test_df.set_index('Id')

train_label = pd.read_csv("train_label.csv")
train_label = train_label.set_index('Id')

categories_df = pd.read_csv("categories_string.csv")

In [None]:
# Put the training descriptions and their labels side by side in a single
# frame; rows are aligned on the shared index.
data_train = pd.concat(objs=[train_df, train_label], axis="columns")

# Plots

In [None]:
# Bar chart: number of training samples in each category.
data_count = train_label["Category"].value_counts()

category_bars = go.Bar(x=data_count.index, y=data_count.values)
fig = go.Figure(data=[category_bars])

# Light-blue bars with a dark-blue outline.
fig.update_traces(
    marker_color='rgb(158,202,225)',
    marker_line_color='rgb(8,48,107)',
    marker_line_width=1.5,
    opacity=0.6,
)

# Tilt the tick labels and name the axes.
fig.update_layout(
    xaxis_tickangle=-45,
    title='Distribution of jobs within categories',
    xaxis={'title': 'Category', 'tickmode': 'linear'},
    yaxis={'title': 'Number of jobs'},
)

fig.show()

In [None]:
# TRAIN set: share of women and men among the rows with a non-null gender.
gender_col = train_df["gender"]

# total number of rows (non-null gender values)
tot = gender_col.count()

# number of women
is_female = gender_col == "F"
female = train_df[is_female]
fem = female["gender"].count()

# number of men
is_male = gender_col == "M"
male = train_df[is_male]
ma = male["gender"].count()

# proportions relative to the total
prop_women = fem / tot
prop_male = ma / tot

In [None]:
# Pie chart of the female/male proportions computed on the train set.
labels = ['Female', 'Male']
values = [prop_women, prop_male]

gender_pie = go.Pie(
    labels=labels,
    values=values,
    textinfo='label+percent',
    insidetextorientation='radial',
    title="Gender distribution in train set",
)
fig = go.Figure(data=[gender_pie])
fig.show()

# Wordcloud

## 1) Before cleaning

In [None]:
# Word cloud over the raw descriptions: join every description into one
# text, then let WordCloud build the image from it.
all_descr = " ".join(data_train["description"].values)

wordcloud_word = WordCloud(background_color="white", collocations=False)
wordcloud_word.generate_from_text(all_descr)

# Show the cloud on a square canvas with the axes hidden.
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud_word, cmap=plt.cm.Paired)
plt.axis("off")
plt.show()

## 2) After cleaning

In [None]:
# CleanText comes from the project-local `clean` module (not shown here).
from clean import CleanText
# Single cleaner instance, used by the cleaning step that follows.
ct = CleanText()

In [None]:
# Run the cleaner on the "description" column of data_train. The result is
# presumably stored in a "description_cleaned" column (the word cloud further
# down reads data_train.description_cleaned) — confirm against clean.py.
ct.clean_df_column(data_train, "description","description_cleaned")

In [None]:
# Word cloud over the cleaned descriptions: join every cleaned description
# into one text, then let WordCloud build the image from it.
all_descr_clean_stem = " ".join(data_train["description_cleaned"].values)

wordcloud_word = WordCloud(background_color="white", collocations=False)
wordcloud_word.generate_from_text(all_descr_clean_stem)

# Show the cloud on a square canvas with the axes hidden.
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud_word, cmap=plt.cm.Paired)
plt.axis("off")
plt.show()