# The purpose of this notebook is to preprocess the raw dataset into a form suitable for a classification task

In [3]:
import os
import warnings
import pandas as pd
warnings.simplefilter(action='ignore', category=FutureWarning)





In [4]:
# Load the raw dataset from the current working directory.
# NOTE(review): the preview below shows an "Unnamed: 0" column — presumably a
# row index saved into the CSV; confirm whether it should be dropped
# (e.g. via index_col=0) before it ends up in the final dataset.
df = pd.read_csv("dataset.csv")

In [5]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,text,label,language
0,"!! sirup se slovem ""Goodness"" v nazvu je na 6 ...",0,cs
1,!KE!,2,cs
2,"""+"" Výkon dobrý. Bez sáčků. ""-"" Malý ""molitan""...",0,cs
3,"""Blikačka"" je klasická, výkonná. Dělalo mi pro...",1,cs
4,"""Boška"" je super lednice.Sice trochu dražší,al...",2,cs


**Function that removes urls and hashtags from all texts for better analysis**

In [6]:
def remove_urls(texts: pd.Series) -> pd.Series:
    '''
    Strips URLs and hashtags out of every text in the given series.

    Three regex substitutions run one after another: any substring that
    starts at ``http`` and runs to the next whitespace is deleted, then
    substrings starting at ``www`` and at ``#`` are each replaced by a
    single space.

    Parameters
    ----------
    texts
        pd.Series (one-dimensional ndarray) of texts.

    Returns
    -------
    pandas.Series
        New series with the matched substrings substituted; the input
        series is not modified.
    '''
    # (pattern, replacement) pairs. Order matters: each pass operates on the
    # output of the previous one.
    substitutions = (
        (r'http\S+', ''),
        (r'www\S+', ' '),
        (r'#\S+', ' '),
    )
    cleaned = texts
    for pattern, replacement in substitutions:
        cleaned = cleaned.replace(pattern, replacement, regex=True)
    return cleaned

In [7]:
# Overwrite the 'text' column with the cleaned texts (URLs and hashtags removed).
df['text'] = remove_urls(df['text'])

**Next, we remove texts that are too short**

In [8]:
def remove_short_texts(text_df: pd.DataFrame,max_len:int) -> pd.DataFrame:
    '''
    Keeps only the rows whose text is strictly longer than ``max_len`` characters.

    The input dataframe is left untouched: no helper column is added to it.

    Parameters
    ----------
    text_df
            pandas.DataFrame with a 'text' column
    max_len
            int length threshold; despite its name it acts as an exclusive
            lower bound, i.e. rows with len(text) <= max_len are dropped

    Returns
    -------
    pandas.DataFrame
            New dataframe stripped of short texts; the original index labels
            of the surviving rows are kept
    '''
    # .str.len() yields a missing value for rows without text, and a missing
    # value never compares greater than max_len, so such rows are dropped
    # instead of raising a TypeError from len().
    text_lengths = text_df['text'].str.len()
    # .copy() hands the caller an independent frame that is safe to modify.
    return text_df[text_lengths > max_len].copy()

In [9]:
# Keep only texts strictly longer than 139 characters.
df = remove_short_texts(df,139)

In [10]:
# Count the remaining rows per language to see how imbalanced the classes are.
df['language'].value_counts()

zh    30985
cs    29325
es     8051
ru     2964
de     2480
en     1495
Name: language, dtype: int64

**Finally, downsample every language to the same number of rows in the dataframe**

In [11]:
# Row count of the smallest language group; used as the per-language sample size.
min_size = int(df['language'].value_counts().min())

In [12]:
# Distinct language codes still present in the filtered data.
df['language'].unique()

array(['cs', 'de', 'en', 'es', 'ru', 'zh'], dtype=object)

In [13]:
def balance_languages(text_df: pd.DataFrame,max_size:int)->pd.DataFrame:
    '''
    Downsamples every language in the dataframe to exactly ``max_size`` rows.

    Languages are processed in order of first appearance in the 'language'
    column, and the sampled groups are stacked in that order.

    Parameters
    ----------
    text_df
        pandas.DataFrame with a 'language' column
    max_size
        int number of rows to keep for each language

    Returns
    -------
    pandas.DataFrame
        New dataframe with ``max_size`` rows per language and a fresh
        0..n-1 index; empty (same columns) when ``text_df`` has no rows

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when a language has fewer than
        ``max_size`` rows (sampling is done without replacement).
    '''
    # Read from the text_df argument (not a notebook-level global) so the
    # function works on whatever frame the caller passes in.
    samples = [
        # Fixed random_state keeps the selection reproducible across runs.
        text_df[text_df['language'] == lang].sample(n=max_size, random_state=42)
        for lang in text_df['language'].unique()
    ]
    if not samples:
        # pd.concat rejects an empty list; return an empty frame with the same columns.
        return text_df.iloc[0:0].copy()
    # Concatenate once instead of growing a dataframe inside the loop.
    return pd.concat(samples, ignore_index=True)

    

In [14]:
# Downsample every language to the size of the smallest one (min_size).
df = balance_languages(df,min_size)

In [15]:
# Sanity check: every language should now have the same row count.
df['language'].value_counts()

cs    1495
de    1495
en    1495
es    1495
ru    1495
zh    1495
Name: language, dtype: int64

**All languages are balanced now. We can save the dataset to a CSV file.**

In [17]:
# Persist the balanced dataset; index=False keeps the row index out of the CSV.
df.to_csv("lng_final_dataset.csv",index=False)