# RQ4

In [3]:
import pandas as pd

## Top Languages
First, we want to find out which languages most of the reviews are written in. We can do this by grouping the dataset by its `language` column, counting the number of reviews in each group (i.e. the size of each group), and finally sorting the counts in descending order and keeping the top 3.

In [222]:
def count_languages(df):
    """
    This function counts how many reviews were written in each of the languages in the dataset
    and returns a list of the occurrences of each language

    Rows whose `language` is missing are ignored (pandas drops NaN group keys by default),
    while rows with a missing `review_id` are still counted

    Arguments
        df : pandas dataframe with (at least) the columns `language` and `review_id`
    Returns
        list of tuples (language, num_of_reviews), ordered alphabetically by language
    """

    # size() counts the rows of every group in a single vectorized pass,
    # instead of materializing one sub-series per language in a Python loop
    reviews_per_language = df.groupby('language')['review_id'].size()

    # Convert the numpy integers to plain Python ints so the result is a plain list of tuples
    return [(lang, int(num)) for lang, num in reviews_per_language.items()]

def sort_count(count, n = 3, reverse = True):
    """
    This function orders a list of (item, occurrences) tuples by occurrences
    and keeps only the first n of them
    e.g. [(a, 1), (b, 3), (c, 2)] —> [(b, 3), (c, 2), (a, 1)]

    Arguments
        count   : list of tuples (item, occurrences)
        n       : how many tuples to keep after sorting (default 3)
        reverse : True for descending order of occurrences (default), False for ascending
    Returns
        sorted and sliced list of tuples (the input list is left untouched)
    """

    def occurrences(entry):
        # The second element of each tuple holds the number of occurrences
        return entry[1]

    # Work on a copy so the caller's list is not reordered
    ranked = list(count)
    ranked.sort(key = occurrences, reverse = reverse)

    return ranked[:n]

def print_top_languages(top_languages):
    """
    This function prints one line per language, showing the capitalized
    language name followed by its number of reviews

    Arguments
        top_languages : list of tuples (e.g [('german', 80), ('french', 70), ('italian', 60)])
    Returns
        void
    """

    for entry in top_languages:
        language, review_total = entry
        title = language.capitalize()
        print(f"{title} with {review_total} reviews")

In [225]:
# Load only the first million rows of the reviews file; the column names
# are inferred from the file's header row.
df = pd.read_csv(
    "data/steam_reviews.csv",
    header="infer",
    nrows=1_000_000,
)

# Count the reviews per language and keep the three most frequent languages.
top_languages = sort_count(count_languages(df))

### The most common languages are

In [226]:
# Print each of the top languages together with its number of reviews.
print_top_languages(top_languages)

English with 392312 reviews
Russian with 185744 reviews
Schinese with 125667 reviews


### Now let's filter the dataset so it only includes reviews in these languages

In [228]:
def _percent_positive(votes):
    """
    This helper computes the percentage of the non-missing values of a column
    that are strictly greater than zero

    Arguments:
        votes : pandas series of vote counts
    Returns:
        percentage (0-100) as a float, or NaN if the series has no non-missing values
    """

    rated = votes.count()

    # Guard the 0 / 0 case explicitly instead of relying on numpy's warning
    if rated == 0:
        return float('nan')

    # A boolean mask sums to the number of rows that received at least one vote
    return 100 * (votes > 0).sum() / rated


def filter_by_language(df, languages):
    """
    This function filters the dataframe so it only contains reviews
    written in certain languages, and prints the percentage among these
    of those which were considered 'Funny'
    and of those which were considered 'Helpful'

    A review counts as 'Funny' (or 'Helpful') when its `votes_funny`
    (or `votes_helpful`) value is greater than zero; reviews with a missing
    value in that column are left out of the percentage. If one of the given
    languages has no reviews at all, a short notice is printed for it instead
    of the percentages.

    Arguments:
        df        : pandas dataframe with the columns `language`, `votes_funny` and `votes_helpful`
        languages : list of languages (e.g ['russian', 'english', 'turkish'])
    Returns:
        f_df      : filtered pandas dataframe
    """

    f_df = df[df['language'].isin(languages)]

    for lang in languages:

        f_df_lang = f_df[f_df['language'] == lang]

        # Nothing to compute a percentage of: say so rather than printing "nan%"
        if f_df_lang.empty:
            print(f"No {lang.capitalize()} reviews found")
            continue

        # Same computation for both vote columns, so run it once per column
        for column, label in (('votes_funny', 'Funny'), ('votes_helpful', 'Helpful')):
            percent = _percent_positive(f_df_lang[column])

            print(f"{percent:.0f}% of the {lang.capitalize()} reviews were considered '{label}'")

    return f_df

### How did other users consider these reviews: Funny or Helpful?

In [229]:
# Keep only the reviews written in the top languages; the call also prints
# the 'Funny' and 'Helpful' percentages for each of those languages.
filtered_df = filter_by_language(df, [lang for lang, _ in top_languages])

9% of the English reviews were considered 'Funny'
25% of the English reviews were considered 'Helpful'
12% of the Russian reviews were considered 'Funny'
31% of the Russian reviews were considered 'Helpful'
10% of the Schinese reviews were considered 'Funny'
23% of the Schinese reviews were considered 'Helpful'


**TODO:** add a few comments about the results when we use the whole dataset (the figures above are based on only the first 1,000,000 rows of the file).