# This first notebook transforms the JSON data into an easier-to-handle CSV format

In [None]:
import pandas as pd
import json
import fasttext as ft

In [None]:
# define function to read line-delimited json
def get_df(fn, limit=None):
    """Load a JSON-lines file into a pandas DataFrame.

    Args:
        fn: Path of a text file holding one JSON document per line.
        limit: Maximum number of records to load. ``None``, ``0`` or a
            negative value loads the whole file.

    Returns:
        A DataFrame with one row per parsed JSON document.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    from itertools import islice

    # Falsy or negative limits historically meant "no limit"; keep that.
    stop = limit if limit and limit > 0 else None

    # Explicit UTF-8 keeps decoding independent of the platform's locale.
    with open(fn, encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline) carry no record; skip them.
        records = (json.loads(line) for line in f if line.strip())
        # islice stops after exactly `stop` records, so limit=N yields N rows.
        json_lines = list(islice(records, stop))
    return pd.DataFrame(json_lines)

In [None]:
# Read the full Yelp review file (parsed line by line by get_df) into a DataFrame
dfr = get_df('../data/yelp_academic_dataset_review.json')

In [None]:
# Save the full review DataFrame as csv.
# index=False keeps the meaningless row index out of the file, so reloading it
# does not produce a spurious unnamed first column.
dfr.to_csv('../data/review.csv', index=False)

## As the Yelp dataset contains nearly 8 million reviews, we will reduce it to only the last two years, 2018 and 2019

In [None]:
# Derive the review year from the date column, then keep only rows whose
# year lies in the inclusive range 2018..2019
dfr['year'] = pd.to_datetime(dfr['date']).dt.year
df = dfr.loc[(dfr['year'] >= 2018) & (dfr['year'] <= 2019)]

In [None]:
# Save the reduced (2018-2019) DataFrame as csv, without writing the row index
df.to_csv('../data/review_1819.csv', index=False)

## Lastly, we will remove all non-English reviews, as we want to perform several NLP steps that preferably work on a dataset with only one language

In [None]:
# Work on a real copy: plain assignment would only alias `df`, so adding the
# language column below would silently modify `df` as well.
df_lang = df.copy()

# Load pretrained fastText language-identification model
fasttext_model = ft.load_model('../data/lid.176.bin')

# Minimum probability a predicted language must reach to be accepted
accuracy = 0.95

# Top predicted label per review ('' when no label reaches the threshold)
language_list = []
for row in df_lang['text']:
    # Flatten newlines so the whole review is passed to predict() as one line
    row = row.replace("\n", " ")
    # k=1: only the best label matters; with a 0.95 threshold at most one
    # label can qualify anyway
    labels, _probabilities = fasttext_model.predict(row, k=1, threshold=accuracy)
    language_list.append(labels[0] if labels else '')

# Assign the plain list so values are matched to rows by position. Going
# through a separately built DataFrame/Series would align on index labels,
# and the filtered frame does not carry a 0..n-1 index, which would attach
# languages to the wrong rows (or NaN).
df_lang['language'] = language_list

# Keep only the reviews confidently identified as English
df_lang = df_lang[df_lang['language'] == '__label__en']

In [None]:
# Drop the helper language column and save the English-only reviews to csv

# Reassign instead of dropping in place: df_lang is the result of a row
# filter, and mutating such a frame in place triggers pandas'
# SettingWithCopyWarning.
df_lang = df_lang.drop(columns='language')
df_lang.to_csv('../data/review_1819_eng.csv', index=False)