In [5]:
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

In [6]:
import plotly
import plotly.express as px
import plotly.offline as py
import plotly.graph_objs as go
import os

In [7]:
# Read recipe inputs: load the Dataiku dataset "tweepy_aspect_sentiment_categorised"
# into a pandas DataFrame. Later cells read these columns from `df`:
# product_id, group, k_means_clusters, count, mean_polarity_nltk, mean_polarity_textblob.
aspects_grouped = dataiku.Dataset("tweepy_aspect_sentiment_categorised")
df = aspects_grouped.get_dataframe()

In [8]:
def weighted_ave(x):
    """Return the count-weighted mean polarity of a group of rows.

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain the columns ``mean_polarity_nltk``,
        ``mean_polarity_textblob`` and ``count``. ``count`` is used as the
        weight of each row.

    Returns
    -------
    pandas.Series
        Two entries, ``weighted_ave_nltk`` and ``weighted_ave_tb``. An entry is
        NaN when no row carries both a polarity and a non-zero total weight.
    """
    def _weighted_mean(values, weights):
        # Only rows that have both a value and a weight may contribute.
        # Summing all weights while NaN values are silently skipped in the
        # numerator would drag the average toward zero.
        valid = values.notna() & weights.notna()
        total = weights[valid].sum()
        if total == 0:
            # Nothing to average (empty group, all-missing, or all-zero counts).
            return np.nan
        return (values[valid] * weights[valid]).sum() / total

    counts = x["count"]
    columns = ['weighted_ave_nltk', 'weighted_ave_tb']
    d = {
        'weighted_ave_nltk': _weighted_mean(x["mean_polarity_nltk"], counts),
        'weighted_ave_tb': _weighted_mean(x["mean_polarity_textblob"], counts),
    }
    return pd.Series(d, index=columns)

In [9]:
# Run the same weighted-average aggregation twice per product: once keyed by the
# `group` column and once keyed by the `k_means_clusters` column.
df_grouped, df_clustered = [
    df.groupby(["product_id", key]).apply(weighted_ave).reset_index()
    for key in ("group", "k_means_clusters")
]


arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.



In [10]:
# Compute recipe outputs from inputs.
# The per-(product_id, group) weighted averages held in df_grouped are used as the
# output as-is; no further transformation is applied in this cell.

sentiment_by_companies_df = df_grouped # alias only: same DataFrame object, no copy is made

In [11]:
# Write recipe outputs: each aggregated frame goes to its own Dataiku dataset.

# Weighted averages per (product_id, group) -> dataset "tweepy_analysis_by_companies"
sentiment_by_companies = dataiku.Dataset("tweepy_analysis_by_companies")
sentiment_by_companies.write_with_schema(df_grouped)

# Weighted averages per (product_id, k_means_clusters) -> dataset "tweepy_KM"
KM_analysis_by_companies = dataiku.Dataset("tweepy_KM")
KM_analysis_by_companies.write_with_schema(df_clustered)

34 rows successfully written (zfYc7w0RZm)
28 rows successfully written (FKqrZdvo6U)


In [12]:
from dataiku import insights

# Resolve the output folder once: it does not depend on the product being
# plotted, so looking it up on every iteration is wasted work.
# NOTE(review): Folder.get_path() presumably needs a folder that lives on the
# local filesystem -- confirm "tweepy_plots" is one.
folder_path = dataiku.Folder("tweepy_plots").get_path()

# One grouped bar chart (NLTK vs TextBlob) per product. groupby(sort=False)
# yields products in order of first appearance, the same order that
# unique() + boolean masking produced, but in a single pass over the frame.
for name, df_sub in df_grouped.groupby("product_id", sort=False):
    # Take x from the same rows as y so each bar is paired with its own group label.
    groups = df_sub["group"]
    fig = go.Figure(data=[
        go.Bar(name='NLTK', x=groups, y=df_sub["weighted_ave_nltk"]),
        go.Bar(name='TextBlob', x=groups, y=df_sub["weighted_ave_tb"]),
    ])
    # Side-by-side bars, titled with the product name.
    fig.update_layout(barmode='group', title_text=name)

    # The same identifier is used for the HTML file name and the insight name.
    insights_name = name.replace(" ", "_") + "_grouped"
    fig_path = os.path.join(folder_path, insights_name)
    fig.write_html(fig_path + ".html")
    insights.save_plotly(insights_name, fig)

In [14]:
# Preview only the first rows instead of rendering the whole input frame.
df.head(10)

Unnamed: 0,product_id,noun_lemmatized,count,mean_polarity_nltk,mean_polarity_textblob,review_id,k_means_clusters,group
0,Air France,1,1,-0.102700,-0.291667,[1255255206620811264],3,punctuality
1,Air France,100,2,0.510600,0.400000,[1243137119138144258],3,punctuality
2,Air France,100k,1,0.000000,0.000000,[1256202104693284865],2,luggage
3,Air France,10th march 2020,2,0.000000,0.000000,[1243403788305584131],0,luggage
4,Air France,11th april,2,0.000000,0.000000,[1243141859163213824],0,punctuality
5,Air France,1200,1,0.000000,0.000000,[1255058629822119937],2,punctuality
6,Air France,12m,2,0.000000,0.000000,[1242981892883918848],3,punctuality
7,Air France,12th march,2,0.000000,0.000000,[1243093929328873472],0,luggage
8,Air France,12th march travel date 13th,2,0.000000,0.000000,[1243070888959864832],0,punctuality
9,Air France,14,1,0.000000,0.285714,[1245304941050298368],3,punctuality


In [13]:
# Preview only the first rows of the per-group weighted averages.
df_grouped.head(10)

Unnamed: 0,product_id,group,weighted_ave_nltk,weighted_ave_tb
0,Air France,company,-0.019328,-0.009776
1,Air France,food,-0.01145,0.017143
2,Air France,luggage,0.036694,0.040862
3,Air France,punctuality,0.135852,0.206144
4,Air France,staff,-0.000112,0.006727
5,Delta Air Lines,company,-0.018013,-0.017933
6,Delta Air Lines,food,-0.034629,0.020441
7,Delta Air Lines,luggage,0.024258,0.029211
8,Delta Air Lines,punctuality,0.098232,0.152042
9,Delta Air Lines,staff,0.094009,0.11024
