In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm

In [2]:
# Public S3 location of the cleaned Glassdoor reviews CSV.
path = (
    "https://fullstackds-projects-bucket.s3.eu-west-3.amazonaws.com"
    "/data/glassdoor_reviews_cleaned.csv"
)

# Read the CSV straight from the URL and preview the first rows.
df = pd.read_csv(path)
df.head()

Unnamed: 0,firm,date_review,overall_rating,work_life_balance,culture_values,career_opp,comp_benefits,senior_mgmt,recommend,ceo_approv,outlook,headline,pros,cons,recommend_score,status,experience
0,Apple,2008-01-31,5,3.0,5.0,3.0,4.0,4.0,v,v,o,We make products that poeple enjoy buying and ...,The people we work with are great and I can't ...,You have to be careful because this job can ta...,2.0,current,No information
1,Apple,2008-02-14,4,4.0,5.0,3.0,4.0,3.0,v,v,o,It is a different job because of all of the di...,The big positive for me is that I really do en...,I suppose that a con would be that there is a ...,2.0,current,No information
2,Apple,2008-02-15,5,4.0,5.0,5.0,5.0,5.0,v,v,o,Working at Apple is the hardest job you will e...,When you work at a place like Apple you have a...,The people at Apple are living on the bleeding...,2.0,current,No information
3,Apple,2008-04-20,5,5.0,5.0,5.0,4.0,5.0,v,v,o,"A retail job that is altogether exciting, thri...",Despite the fact that it all boils down to sel...,"As part of their intensive training, there are...",2.0,current,No information
4,Apple,2008-04-20,2,3.0,5.0,2.0,3.0,1.0,x,x,o,Worst choice ever.,The benefits for fulltime employees were great...,Too much confidentiality. You couldn't even f...,0.0,former,No information


In [3]:
# Summarize the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354200 entries, 0 to 354199
Data columns (total 17 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   firm               354200 non-null  object 
 1   date_review        354200 non-null  object 
 2   overall_rating     354200 non-null  int64  
 3   work_life_balance  354200 non-null  float64
 4   culture_values     354200 non-null  float64
 5   career_opp         354200 non-null  float64
 6   comp_benefits      354200 non-null  float64
 7   senior_mgmt        354200 non-null  float64
 8   recommend          354200 non-null  object 
 9   ceo_approv         354200 non-null  object 
 10  outlook            354200 non-null  object 
 11  headline           352907 non-null  object 
 12  pros               354200 non-null  object 
 13  cons               354197 non-null  object 
 14  recommend_score    354200 non-null  float64
 15  status             354200 non-null  object 
 16  ex

In [4]:
# Count missing values per column, most affected columns first.
df.isna().sum().sort_values(ascending=False)

headline             1293
cons                    3
firm                    0
ceo_approv              0
status                  0
recommend_score         0
pros                    0
outlook                 0
recommend               0
date_review             0
senior_mgmt             0
comp_benefits           0
career_opp              0
culture_values          0
work_life_balance       0
overall_rating          0
experience              0
dtype: int64

### Text columns preprocessing

On utilise spaCy pour la lemmatisation et pour supprimer les mots peu informatifs (stopwords).

Chaque champ (headline, pros, cons) est nettoyé séparément, ce qui nous permettra plus tard de faire une analyse différenciée si besoin.

Ensuite, on combine les versions nettoyées dans une colonne `full_text_clean` qu’on pourra utiliser pour une analyse globale du climat émotionnel (nuage de mots, clustering).

In [5]:
#pip install spacy -q

In [6]:
#pip install numpy==1.26.4

In [7]:
import re
import spacy
# Show the installed spaCy version, its location and the pipelines available in this environment.
spacy.info()

{'spacy_version': '3.7.2',
 'location': 'C:\\Users\\user\\anaconda3\\envs\\environment\\lib\\site-packages\\spacy',
 'platform': 'Windows-10-10.0.26100-SP0',
 'python_version': '3.10.16',
 'pipelines': {'en_core_web_md': '3.7.1'}}

In [8]:
# Download the model only when it is not installed yet.
# spaCy's Python API is used instead of `!python -m spacy download ...` because a bare
# `python` is resolved through PATH and may install into a different environment than
# the one the kernel is running in; the guard also avoids a re-download on every run.
if not spacy.util.is_package("en_core_web_md"):
    spacy.cli.download("en_core_web_md")

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_md')


In [9]:
# Load the medium English pipeline through the installed en_core_web_md package.
# `nlp` is the module-level pipeline used by the text-cleaning helpers below.
import en_core_web_md
nlp = en_core_web_md.load()

In [10]:
# Function to clean text with RE

def preprocess_text(text):
    """Normalize a raw text with regular expressions.

    The text is lowercased, digits are removed, punctuation (any character
    that is neither a word character nor whitespace) is removed, and finally
    every run of whitespace is collapsed into a single space.

    Args:
        text: Raw text. Any non-string value (e.g. ``None`` or a float NaN
            coming from a pandas column) is treated as empty text.

    Returns:
        str: The cleaned text, with single spaces between words and no
        leading or trailing whitespace.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'\d+', '', text)      # remove digits
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation symbols
    # Collapse whitespace last: removing digits/punctuation can leave two
    # spaces side by side ("a 5 b" -> "a  b"), so doing this first would
    # let double spaces back into the result.
    text = re.sub(r'\s+', ' ', text).strip()
    return text
    
# Wrap for batch processing with nlp.pipe    
def batch_clean_texts(texts, batch_size=1000):
    """Clean and lemmatize a collection of texts in batches.

    Every text is first normalized with ``preprocess_text`` and then run
    through the module-level ``nlp`` pipeline via ``nlp.pipe`` with the
    "ner" and "parser" components disabled. For each document, the lemmas
    of the tokens that are not stop words, punctuation or whitespace are
    joined with single spaces.

    Args:
        texts: Iterable of raw texts.
        batch_size: Batch size forwarded to ``nlp.pipe``.

    Returns:
        list[str]: One cleaned string per input text.
    """
    texts = [preprocess_text(t) for t in texts]
    # NER and the dependency parser are not needed for lemmas / stop-word flags.
    docs = nlp.pipe(texts, batch_size=batch_size, disable=["ner", "parser"])
    cleaned = []
    for doc in docs:
        # Whitespace tokens are neither stop words nor punctuation, so without
        # the is_space check they would leak into the output as extra blanks.
        tokens = [
            token.lemma_
            for token in doc
            if not (token.is_stop or token.is_punct or token.is_space)
        ]
        cleaned.append(' '.join(tokens))
    return cleaned

# Register tqdm with pandas (enables `progress_apply`); nothing in this cell uses it,
# it is kept in case a later cell does.
tqdm.pandas()

# Clean each free-text column separately; missing texts are treated as empty strings.
text_columns = ['headline', 'pros', 'cons']
for col in text_columns:
    df[f'{col}_clean'] = batch_clean_texts(df[col].fillna('').tolist())

# Combine all cleaned texts in one column, separated by '. '.
# Series.str.cat is vectorized, unlike a row-wise .agg(..., axis=1) over the whole frame.
clean_columns = [f'{col}_clean' for col in text_columns]
df['full_text_clean'] = df[clean_columns[0]].str.cat(df[clean_columns[1:]], sep='. ')

# Show result
display(df[clean_columns + ['full_text_clean']].head())

Unnamed: 0,headline_clean,pros_clean,cons_clean,full_text_clean
0,product poeple enjoy buy,people work great not imagine life will produc...,careful job life careful need learn seperate ...,product poeple enjoy buy. people work great no...
1,different job different creative product work,big positive enjoy job apple innovative change...,suppose con hard balance work home life,different job different creative product work....
2,work apple hard job love,work place like apple opportunity change way w...,people apple live bleed edge not know future h...,work apple hard job love. work place like appl...
3,retail job altogether exciting thrilling life ...,despite fact boil sell product apple make feel...,intensive training mandatory meeting month pos...,retail job altogether exciting thrilling life ...
4,bad choice,benefit fulltime employee great stock option r...,confidentiality not fart get reprimand way fee...,bad choice. benefit fulltime employee great st...


In [11]:

# Preview the first rows of the full frame.
display(df.head())


Unnamed: 0,firm,date_review,overall_rating,work_life_balance,culture_values,career_opp,comp_benefits,senior_mgmt,recommend,ceo_approv,...,headline,pros,cons,recommend_score,status,experience,headline_clean,pros_clean,cons_clean,full_text_clean
0,Apple,2008-01-31,5,3.0,5.0,3.0,4.0,4.0,v,v,...,We make products that poeple enjoy buying and ...,The people we work with are great and I can't ...,You have to be careful because this job can ta...,2.0,current,No information,product poeple enjoy buy,people work great not imagine life will produc...,careful job life careful need learn seperate ...,product poeple enjoy buy. people work great no...
1,Apple,2008-02-14,4,4.0,5.0,3.0,4.0,3.0,v,v,...,It is a different job because of all of the di...,The big positive for me is that I really do en...,I suppose that a con would be that there is a ...,2.0,current,No information,different job different creative product work,big positive enjoy job apple innovative change...,suppose con hard balance work home life,different job different creative product work....
2,Apple,2008-02-15,5,4.0,5.0,5.0,5.0,5.0,v,v,...,Working at Apple is the hardest job you will e...,When you work at a place like Apple you have a...,The people at Apple are living on the bleeding...,2.0,current,No information,work apple hard job love,work place like apple opportunity change way w...,people apple live bleed edge not know future h...,work apple hard job love. work place like appl...
3,Apple,2008-04-20,5,5.0,5.0,5.0,4.0,5.0,v,v,...,"A retail job that is altogether exciting, thri...",Despite the fact that it all boils down to sel...,"As part of their intensive training, there are...",2.0,current,No information,retail job altogether exciting thrilling life ...,despite fact boil sell product apple make feel...,intensive training mandatory meeting month pos...,retail job altogether exciting thrilling life ...
4,Apple,2008-04-20,2,3.0,5.0,2.0,3.0,1.0,x,x,...,Worst choice ever.,The benefits for fulltime employees were great...,Too much confidentiality. You couldn't even f...,0.0,former,No information,bad choice,benefit fulltime employee great stock option r...,confidentiality not fart get reprimand way fee...,bad choice. benefit fulltime employee great st...


In [12]:
# Save to csv file
#df.to_csv('s3://fullstackds-projects-bucket/data/reviews_processed_with_spacy_md.csv', index=False)