<a href="https://colab.research.google.com/github/HenryBlairG/IIC2154-DiagnosticoGitFlow/blob/feature%2Fsetup_py-project/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Main File to run Twitter Top 10's
***

## 1. Librerias Necesarias

In [1]:
import json as js
import pandas as pd

## 2. Funciones de Utilidad

In [2]:
def load_data(path: str='dataset.json') -> pd.DataFrame:
    '''
    load_data Load dataset from specified path

    Load data from path and preprocess for application consumption by removing unnecesary information

    Args:
        path (str, optional): path where data is saved. Defaults to 'dataset.json'.

    Returns:
        pd.DataFrame: dataset with necessary information for consumption
    '''
    with open(path) as dataset_raw:
        df = pd.DataFrame.from_records(js.loads(l) for l in dataset_raw)
        df = df[[
            'date',         # For 10 Best dates with most tweets
            'content',      # For 10 Hashtags most used
            'user',         # For 10 Users with most tweets
            'retweetCount'  # For 10 Tweets most retweeted
        ]]
        return df

# load_data().head(5)

## 3. Rankings

### 3.1 Los top 10 tweets más retweeted

In [3]:
def tweets_most_retweeted(qty: int=10, df: pd.DataFrame=None) -> bool:
    '''
    tweets_most_retweeted table of content and retweets count

    Sort DataFrame by retweets count and display the content of the specified best

    Args:
        qty (int, optional): number of tweets to be displayed. Defaults to 10.
        df (pd.DataFrame, optional): data source. Defaults to None.

    Returns:
        bool: Flow control, True if everything works correctly
    '''
    top_qty = df.sort_values('retweetCount', ascending=False).head(qty)
    print(top_qty[['content', 'retweetCount']])
    return True

# tweets_most_retweeted(df=load_data())


### 3.2 Los top 10 usuarios en función a la cantidad de tweets que emitieron

In [4]:
def users_most_tweet_count(qty: int=10, df: pd.DataFrame=None) -> bool:
    '''
    users_most_tweet_count table of users and tweets count

    Group Dataframe by users and sort by tweets count and display username and tweets count 

    Args:
        qty (int, optional): number of users to be displayed. Defaults to 10.
        df (pd.DataFrame, optional): data source. Defaults to None.

    Returns:
        bool: Flow control, True if everything works correctly
    '''
    users_df = df[['user', 'content']].copy(deep=True)
    users_df['username'] = users_df.user.map(lambda x: x['username'])
    users_df['num_tweets'] = users_df['content']
    users_df = users_df[['username', 'num_tweets']]
    users_df = users_df.groupby('username').count()
    users_df = users_df.sort_values('num_tweets', ascending=False)
    print(users_df.head(10))
    return True

# users_most_tweet_count(df=load_data())

### 3.3 Los top 10 días donde hay más tweets

In [5]:
def dates_most_tweet_count(qty: int=10, df: pd.DataFrame=None) -> bool:
    '''
    users_most_tweet_count table of dates and tweets count

    Group Dataframe by users and sort by tweets count and display username and tweets count 

    Args:
        qty (int, optional): number of dates to be displayed. Defaults to 10.
        df (pd.DataFrame, optional): data source. Defaults to None.

    Returns:
        bool: Flow control, True if everything works correctly
    '''
    dates_df = df[['date', 'content']].copy(deep=True)
    dates_df['num_tweets'] = dates_df['content']
    dates_df = dates_df[['date', 'num_tweets']]
    dates_df = dates_df.groupby('date').count()
    dates_df = dates_df.sort_values('num_tweets', ascending=False)
    print(dates_df.head(10))
    return True

# dates_most_tweet_count(df=load_data())

### 3.4 Los Top 10 hashtags más usados

In [6]:
def hashtags_most_tweeted(qty: int=10, df: pd.DataFrame=None) -> bool:
    '''
    hashtags_most_tweeted table of hashtags and tweets count

    Sort DataFrame by tweets count and display the hashtags of the specified best

    Args:
        qty (int, optional): number of hashtags to be displayed. Defaults to 10.
        df (pd.DataFrame, optional): data source. Defaults to None.

    Returns:
        bool: Flow control, True if everything works correctly
    '''
    new_df = pd.DataFrame()
    new_df['hashtags'] = df['content'].str.extractall(r'#(\w+)').\
        reset_index(level=0).drop_duplicates()[0].value_counts()

    print(new_df.head(qty))
    return True

hashtags_most_tweeted(df=load_data())

                          hashtags
FarmersProtest              400365
IStandWithFarmers            15745
farmersprotest               15394
IndianFarmersHumanRights     11859
FarmersAreIndia              10947
StandWithFarmers             10577
Rihanna                       9023
FarmersProtests               8714
Farmers                       6413
shameonbollywood              6166


True

## 4. Funcion Principal

In [7]:
def main():
    '''
    main executes selected option

    display options and executes validated option
    '''
    options = dict(
        top10retweets=tweets_most_retweeted,
        top10users=users_most_tweet_count,
        top19dates=dates_most_tweet_count,
        exit=exit
    )
    
    res = True
    while res:
        for i, k in enumerate(options.keys()):
            print(f'{i+1}. {k}')
        
        option = input('Seleccione Una Opción: ')
        while option not in options:
            option = input('Opcion no Encontrada. Seleccione Una Opción: ')
        
        ex = options.get(option, lambda: False)
        res = ex()
        res_string = f'Resultado:\n{res}' if res else ''
        
        print(f'{res_string}\n{"=" *100}')

## 5. Ejecución

In [8]:
if __name__ == '__main__':
    main()

1. top10retweets
2. top10users
3. top19dates
4. exit

