In [2]:
import slo_twitter_data_analysis_utility_functions_v2 as tweet_util_v2

# Load the Tweet dataset from its CSV file; the result is used below as a Pandas dataframe.
# NOTE(review): the path is absolute and machine-specific — consider making it configurable.
# NOTE(review): the meaning of the "csv" and False arguments is defined by import_dataset — confirm there.
tweet_csv_dataframe = tweet_util_v2.import_dataset(
    "D:/Dropbox/summer-research-2019/jupyter-notebooks/attribute-datasets/twitter-dataset-6-22-19-test.csv",
    "csv",
    False,
)

# SLO Twitter Data Analysis - Character Count Statistics

<span style="font-family:Papyrus; font-size:1.25em;">

This function provides character counts for all the Tweets associated with a given company.  We visualize this statistic by plotting a relative frequency histogram of those counts for the CSV dataset.<br>

</span>

In [None]:
def tweet_character_counts(tweet_dataframe):
    """
    Plot per-company character count histograms for Tweet text and user descriptions.

    Two facet grids are drawn, each with one panel per 'company_derived_designation' value:
    the first for the 'text_derived' column and the second for the 'user_description' column.
    Only rows whose 'multiple_companies_derived_count' equals 1 are plotted.

    Note: The raw JSON file does not have associated "company" information.

    :param tweet_dataframe: the Twitter dataset in a Pandas dataframe.
    :return: None.
    """
    # Select only rows with one associated company. (don't graph company combos)
    single_company_only_df = tweet_dataframe.loc[tweet_dataframe['multiple_companies_derived_count'] == 1]

    print("Character Count Statistics of Tweet Text for CSV dataset by Company: ")
    _plot_character_count_histograms(single_company_only_df, 'text_derived')

    print("Character Count Statistics of User Description Text for CSV dataset by Company: ")
    _plot_character_count_histograms(single_company_only_df, 'user_description')


def _plot_character_count_histograms(dataframe, text_column):
    """
    Draw one panel per company for the character counts of a single text column.

    :param dataframe: dataframe holding text_column and 'company_derived_designation'.
    :param text_column: name of the text column whose character counts are plotted.
    :return: None.
    """
    print("Character Count Relative Frequency Histogram: ")
    # FacetGrid creates its own figure, so no plt.figure() call is made beforehand
    # (it would only leave an extra, empty figure behind).
    grid = sns.FacetGrid(dataframe[[text_column, 'company_derived_designation']],
                         col='company_derived_designation', col_wrap=6,
                         ylim=(0, 1))
    # NOTE(review): relhist_proc / char_len live in the utility module; presumably they draw a
    # relative frequency histogram of the text lengths — confirm there.
    grid.map_dataframe(tweet_util.relhist_proc, text_column, bins=10, proc=tweet_util.char_len) \
        .set_titles('{col_name}').set_xlabels("# of Characters").set_ylabels("Percentage of all Tweets")
    plt.show()

<span style="font-family:Papyrus; font-size:1.25em;">

We call our data analysis function and pass in the CSV dataset, imported into a Pandas dataframe as usual.<br>

</span>

In [None]:
    # Plot the per-company character count histograms for Tweet text and user descriptions.
    tweet_character_counts(tweet_dataframe=tweet_csv_dataframe)

<span style="font-family:Papyrus; font-size:1.25em;">

The graphs appear to show that most Tweets for any given company are relatively long.<br>

TODO - see if foreign (non-English) Tweets are responsible for extremely long Tweets due to encoding issues.<br>

</span>

In [None]:
    # Threshold against which the length columns are compared.
    character_length = 140

    # Rows whose Tweet text length column exceeds the threshold.
    long_tweets = tweet_csv_dataframe[
        tweet_csv_dataframe["tweet_text_length_derived"].gt(character_length)]
    print(f"The number of Tweets over {character_length} is {len(long_tweets)}")

    # Rows whose user description length column exceeds the threshold.
    long_description = tweet_csv_dataframe[
        tweet_csv_dataframe["user_description_text_length"].gt(character_length)]
    print(f"The number of user descriptions over {character_length} is {len(long_description)}")

<span style="font-family:Papyrus; font-size:1.25em;">
    
**TODO: refactor into function in data analysis file**

</span>

### Hashtag Statistics for Twitter dataset:

<span style="font-family:Papyrus; font-size:1.25em;">

This function computes hashtag statistics.

</span>

In [None]:
def hashtags(tweet_dataframe):
    """
    Hashtag related statistics and visualizations.

    Plots, with one panel per 'company_derived_designation' value, the number of hashtags per
    Tweet for rows whose 'multiple_companies_derived_count' equals 1. The hashtag counts are
    kept in a separate plotting frame; the input dataframe is not modified.

    :param tweet_dataframe: the Twitter dataset in a dataframe.
    :return: None.

    FIXME - graphs function; text stats non-functional (TypeError: 'float' object is not iterable)
    """
    # Select only rows with one associated company. (don't graph company combos)
    single_company_only_df = tweet_dataframe.loc[tweet_dataframe['multiple_companies_derived_count'] == 1]

    # the number of hashtags within tweets
    print("The Number of Hashtags within Tweets:")
    # Missing values (None or a float such as NaN) count as zero hashtags.
    # NOTE(review): len() counts hashtags only if each cell holds a list; if the CSV import
    # leaves the cell as a string this counts characters instead — confirm the column's type.
    hashtag_counts = single_company_only_df['tweet_entities_hashtags'].apply(
        lambda x: len(x) if x is not None and not isinstance(x, float) else 0)

    # Build the plotting frame from the single-company rows only, rather than writing a
    # '#hashtags' column into the caller's dataframe and plotting every row.
    plot_df = single_company_only_df[['company_derived_designation']].assign(
        **{'#hashtags': hashtag_counts})

    # FacetGrid creates its own figure, so no plt.figure() call is made beforehand.
    grid = sns.FacetGrid(plot_df[['#hashtags', 'company_derived_designation']],
                         col='company_derived_designation', col_wrap=6,
                         ylim=(0, 1),
                         xlim=(-1, 10))
    grid.map_dataframe(tweet_util.bar_plot, '#hashtags').set_titles('{col_name}') \
        .set_xlabels("# of Hashtags").set_ylabels("Percentage of All Tweets?")
    plt.show()

    # Disabled text statistics (see FIXME in the docstring).
    # NOTE(review): the `is not None` filter lets float NaN cells through, which is the likely
    # source of the TypeError; the code also groups by 'company_derived' rather than the
    # 'company_derived_designation' column used above — confirm before re-enabling.
    #
    # # top hashtags
    # single_company_only_df[['company_derived', 'tweet_entities_hashtags']].groupby('company_derived') \
    #     .apply(lambda x: pd.Series([hashtag
    #                                 for hashtags in x['tweet_entities_hashtags'] if hashtags is not None
    #                                 for hashtag in hashtags]) \
    #            .value_counts(normalize=True) \
    #            .head())
    #
    # # top hashtags, lower-cased
    # single_company_only_df[['company_derived', 'tweet_entities_hashtags']].groupby('company_derived') \
    #     .apply(lambda x: pd.Series([hashtag.lower()
    #                                 for hashtags in x['tweet_entities_hashtags'] if hashtags is not None
    #                                 for hashtag in hashtags]) \
    #            .value_counts(normalize=True) \
    #            .head())

<span style="font-family:Papyrus; font-size:1.25em;">

Call the data analysis function.  We limit ourselves to rows in our dataframe where Tweets are only associated with a single company.<br>

</span>

In [None]:
    # Plot the per-company hashtag counts for the CSV dataset.
    hashtags(tweet_dataframe=tweet_csv_dataframe)

<span style="font-family:Papyrus; font-size:1.25em;">

TODO - understand what the output is producing.<br>

</span>

### Mentions Statistics for Twitter dataset:

<span style="font-family:Papyrus; font-size:1.25em;">

This function computes user mentions statistics.<br>

</span>

In [None]:
def mentions(tweet_dataframe):
    """
    Mentions related statistics and visualizations.

    Prints the share of rows with a non-null 'tweet_entities_user_mentions_id' and with a
    non-null 'tweet_in_reply_to_status_id' (both relative to the whole dataframe). Then, for
    rows whose 'multiple_companies_derived_count' equals 1, plots the number of mentions per
    Tweet by company and prints the most frequent mention ids per company.
    The input dataframe is not modified.

    :param tweet_dataframe: the Twitter dataset in a dataframe.
    :return: None.
    """
    # Select only rows with one associated company. (don't graph company combos)
    single_company_only_df = tweet_dataframe.loc[tweet_dataframe['multiple_companies_derived_count'] == 1]

    # These two ratios are deliberately relative to the full dataframe, as their labels state.
    print("tweet_entities_user_mentions_id count divided by length of tweet_dataframe")
    print(tweet_dataframe['tweet_entities_user_mentions_id'].count() / len(tweet_dataframe))

    print("tweet_in_reply_to_status_id count divided by length of tweet_dataframe")
    print(tweet_dataframe['tweet_in_reply_to_status_id'].count() / len(tweet_dataframe))

    # the number of mentions within tweets
    print("\nThe number of Mentions within the Tweets:")
    # Anything that is not a list (None, NaN, ...) counts as zero mentions.
    mention_counts = single_company_only_df['tweet_entities_user_mentions_id'].apply(
        lambda x: len(x) if isinstance(x, list) else 0)

    # Build the plotting frame from the single-company rows only, rather than writing a
    # '#mentions' column into the caller's dataframe and plotting every row.
    plot_df = single_company_only_df[['company_derived_designation']].assign(
        **{'#mentions': mention_counts})

    # FacetGrid creates its own figure, so no plt.figure() call is made beforehand.
    grid = sns.FacetGrid(plot_df[['#mentions', 'company_derived_designation']],
                         col='company_derived_designation', col_wrap=6,
                         ylim=(0, 1),
                         xlim=(-1, 10))
    grid.map_dataframe(tweet_util.bar_plot, '#mentions').set_titles('{col_name}') \
        .set_xlabels("Number of Mentions").set_ylabels("Percentage of All Tweets?")
    plt.show()

    # top mentions
    print("Top (Most) Mentions for a Company by User Mentions ID")
    print(
        single_company_only_df
        .groupby('company_derived_designation')['tweet_entities_user_mentions_id']
        .apply(_top_mention_shares))


def _top_mention_shares(mention_lists):
    """
    Return the normalized value counts of the most frequent mention ids (first five).

    :param mention_lists: iterable of per-Tweet mention id lists; non-list entries are skipped.
    :return: a Pandas Series indexed by mention id.
    """
    # Use the same isinstance(list) test as the mention count: an `is not None` check alone
    # lets float NaN through and fails with "TypeError: 'float' object is not iterable".
    mention_ids = [mention_id
                   for entry in mention_lists if isinstance(entry, list)
                   for mention_id in entry]
    return pd.Series(mention_ids).value_counts(normalize=True).head()

<span style="font-family:Papyrus; font-size:1.25em;">

Call the data analysis function.  We limit ourselves to rows in our dataframe where Tweets are only associated with a single company.<br>

</span>

In [None]:
    # Print the mention ratios and top mentions, and plot the per-company mention counts.
    mentions(tweet_dataframe=tweet_csv_dataframe)

<span style="font-family:Papyrus; font-size:1.25em;">

TODO - understand what the output is producing.<br>

</span>