### Format for dashboard:
-

In [172]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

In [173]:
# Load the links table (first CSV column used as the index); its 'company'
# column determines which per-company review files are read afterwards.
df_links = pd.read_csv('data/links_luxobelg.csv', index_col=0)

In [174]:
def get_csvs_as_dict(df_links, countries=('belgian', 'lux'), base_dir='data/raw_reviews'):
    """
    Load the raw review CSVs of every company listed in df_links.

    For each country in countries and each value of df_links['company'], the file
    '{base_dir}/{company}_{country}_reviews.csv' is read with its first column as index.
    A company whose file is missing (or completely empty) for a country is skipped,
    so the inner dictionaries only contain the companies that were actually loaded.

    df_links: dataframe with a 'company' column
    countries: country suffixes used in the file names
    base_dir: directory that contains the review CSV files

    Returns a nested dictionary {country: {company: dataframe of reviews}}
    """
    reviews = {}
    for country in countries:
        reviews[country] = {}
        for company in df_links['company']:
            path = f'{base_dir}/{company}_{country}_reviews.csv'
            try:
                reviews[country][company] = pd.read_csv(path, index_col=0)
            except (FileNotFoundError, pd.errors.EmptyDataError):
                # Not every company has a review file for every country: skip those,
                # but let any other error (bad path, parser bug, Ctrl-C) surface.
                continue
    return reviews

In [175]:
# Nested dictionary {country: {company: raw reviews dataframe}} built from the CSV files
reviews_dict = get_csvs_as_dict(df_links)

In [176]:
def split_date_place_job(df_reviews):
    """
    Expect a single dataframe of reviews with a column date_and_job and return a new dataframe
    where that column is replaced by two leading columns: date and job_location.

    The value is split on the first ' - ' only; rows without that separator get a
    missing job_location. The input dataframe is left unmodified.
    """
    parts = df_reviews['date_and_job'].str.split(' - ', n=1, expand=True)
    # When no row contains the separator the split yields a single column;
    # reindexing guarantees both columns exist before they are renamed.
    parts = parts.reindex(columns=[0, 1])
    parts.columns = ['date', 'job_location']

    remaining = df_reviews.drop(columns='date_and_job')
    return pd.concat([parts, remaining], axis=1)

In [177]:
def format_date(date):
    """
    Function to be mapped
    Expect date string like May 6, 2022 (abbreviated month name, day, year)
    and convert it to datetime object

    Leading, trailing and repeated whitespace is tolerated.
    Raises ValueError when the string does not match that layout.
    """
    # Normalise whitespace so that strptime sees exactly 'Mon D, YYYY'
    normalized = ' '.join(date.split())
    # %b parses the abbreviated month name directly, no manual month lookup needed
    return datetime.strptime(normalized, '%b %d, %Y')

In [178]:
def split_date_to_datetime(reviews_dict):
    """
    Expect the nested dictionary {country: {company: dataframe}} and, for every dataframe,
    split its date_and_job column and convert the resulting date column to datetime objects.
    The dictionary entries are replaced in place and the same dictionary is returned.
    """
    for companies in reviews_dict.values():
        for name in companies:
            frame = split_date_place_job(companies[name])
            # Store the split frame first, then convert its dates (same object)
            companies[name] = frame
            frame['date'] = frame['date'].map(format_date)
    return reviews_dict

In [179]:
# split_date_to_datetime updates reviews_dict in place and returns it,
# so splitted_date refers to the same dictionary as reviews_dict.
splitted_date = split_date_to_datetime(reviews_dict)

In [180]:
# Names of the rating columns whose star strings are converted to numbers
# and for which rolling means are computed.
star_columns = ['Work/Life Balance', 'Culture & Values', 'Diversity & Inclusion', 'Career Opportunities', 'Compensation and Benefits', 'Senior Management']

In [181]:
def stars_to_int(row):
    """
    Function to be mapped on columns with ratings as stars
    Return the number of stars (1 to 5) when the value is exactly '*' ... '*****',
    otherwise np.nan
    """
    # Compare against the longest rating first, down to a single star
    for count in range(5, 0, -1):
        if row == '*' * count:
            return count
    return np.nan

In [182]:
def clean_dfs(dfs):
    """
    Expect the nested dictionary {country: {company: dataframe}} and, for every dataframe,
    set the date column as (sorted) index and convert the star rating columns to numbers.
    Entries that are floats are skipped. The dataframes are modified in place and the
    same dictionary is returned.
    """
    for companies in dfs.values():
        for frame in companies.values():
            # Float entries are placeholders, not dataframes: nothing to clean
            if isinstance(frame, float):
                continue
            frame.set_index('date', inplace=True)
            frame.sort_index(inplace=True)
            for column in star_columns:
                frame[column] = frame[column].map(stars_to_int)

    return dfs

In [183]:
# NOTE(review): this rebinds the name clean_dfs from the function to its result (a dictionary),
# so the function can no longer be called afterwards and re-running this cell fails.
# Consider a different variable name (the later add_rolling_means call must follow).
clean_dfs = clean_dfs(splitted_date)

In [184]:
def add_rolling_means(dfs, window=10, columns=None):
    """
    Expect the nested dictionary {country: {company: dataframe}} and add, for every rating
    column, a column '<rating column>_rolling_<window>mean' with the rolling mean over the
    last `window` rows (in the dataframe's current row order).

    A value is produced as soon as one non-missing rating is in the window (min_periods=1);
    missing ratings are left out of the mean.

    dfs: nested dictionary of dataframes, float entries are skipped
    window: number of rows in the rolling window (default 10)
    columns: rating columns to process, defaults to the module-level star_columns

    The dataframes are modified in place and the same dictionary is returned.
    """
    if columns is None:
        columns = star_columns
    for df_country in dfs.values():
        for df_company in df_country.values():
            if isinstance(df_company, float):
                continue
            for star_column in columns:
                # Rolling.mean() takes no skipna argument (TypeError on pandas >= 2.0);
                # NaN values are already excluded from the window mean.
                df_company[f'{star_column}_rolling_{window}mean'] = (
                    df_company[star_column].rolling(window, min_periods=1).mean()
                )
    return dfs

In [185]:
# At this point clean_dfs is the cleaned dictionary (the name was rebound above), not the function;
# add_rolling_means adds the rolling-mean columns in place and returns the same dictionary.
dfs_finished = add_rolling_means(clean_dfs)

In [189]:
def combine_dfs(dfs):
    """
    Expect the nested dictionary {country: {company: dataframe}} and stack all dataframes
    into a single one, with two extra columns 'country' and 'company' telling where each
    row comes from.

    Float entries are skipped. The 'country' and 'company' columns are also added to the
    individual dataframes (in place). The index of the individual dataframes is kept.

    Returns the combined dataframe; when there is nothing to combine, an empty dataframe
    with only the 'country' and 'company' columns.
    """
    frames = []
    for country, df_country in dfs.items():
        for company, df_company in df_country.items():
            if isinstance(df_company, float):
                continue
            df_company['country'] = country
            df_company['company'] = company
            frames.append(df_company)

    if not frames:
        return pd.DataFrame(columns=['country', 'company'])
    # One concat instead of repeated DataFrame.append (removed in pandas 2.0,
    # and copying the whole result on every iteration)
    return pd.concat(frames)

In [190]:
# Single dataframe with the rows of all companies and countries, tagged with 'country' and 'company'
combined_dfs = combine_dfs(dfs_finished)

In [192]:
#combined_dfs.to_csv('data/combined_reviews.csv')

### Text processing

In [41]:
# Print the length of every per-company entry, country by country
# (presumably the number of reviews per company — each entry is expected to be a dataframe).
# NOTE(review): `reviews` is not defined in the cells shown here; the loaded data is bound to
# `reviews_dict` — presumably a leftover from an earlier session, confirm before re-running.
for countries in reviews.values():
    for company in countries.values():
        print(len(company))

137
13
97
24
36
52
39
229
122
68
38
234
190
44
139
262
18


In [52]:
from gensim.utils import simple_preprocess

# Tokenised 'pros' texts of the Belgian Deloitte reviews, one entry per review
# NOTE(review): `reviews` is not defined in the cells shown here — confirm it is the loaded review dictionary.
processed = []
# iterate over the 'pros' column (the index i from enumerate is not used)
for i, text in enumerate(reviews['belgian']['deloitte']['pros']):
    document = simple_preprocess(text) # gensim helper, turns the text into a list of tokens
    processed.append(document) # add to list

In [53]:
# Number of processed texts (one per iterated 'pros' entry)
len(processed)

229

In [54]:
# Print the length (number of tokens) of every processed text.
# Idea: check for a correlation between text length and scores, although probably not really
# informative. Only the lengths are printed here; no correlation is computed.
for pro in processed:
    print(len(pro))

7
9
6
5
4
7
11
7
22
14
5
17
8
32
9
60
5
5
9
7
5
5
7
26
7
6
38
7
5
6
6
10
7
20
9
7
6
8
8
5
6
5
6
9
10
5
8
6
5
12
5
11
5
12
21
6
23
4
5
7
9
7
8
12
5
6
20
7
10
5
6
11
15
42
8
7
5
8
5
8
6
14
5
11
16
5
6
5
11
8
7
9
5
10
8
35
6
6
17
7
5
8
8
11
7
22
6
6
6
9
5
5
13
6
8
8
19
5
28
9
5
6
6
3
6
7
20
6
22
4
4
7
6
79
9
8
10
5
5
35
23
6
8
5
5
12
32
13
9
9
18
9
18
7
8
8
8
14
14
5
13
5
10
10
13
22
4
14
5
55
6
9
14
11
82
4
10
8
14
7
8
19
14
10
7
7
19
24
18
7
19
5
26
26
5
10
8
18
6
35
39
42
23
20
5
7
14
27
23
20
39
19
33
19
21
9
5
5
10
12
36
17
27
8
20
5
29
19
17
