# EXPLORATORY ANALYSIS

In [None]:
# Colab-only: open a file picker to upload files into the runtime
# (presumably reviews_dataset.csv, which is read further down - confirm)
from google.colab import files
files.upload()

In [None]:
# libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


## ANALYSIS OF THE DATASET

In [None]:
# load the reviews dataset from the CSV in the working directory
df = pd.read_csv('reviews_dataset.csv')


In [None]:
# inspect the raw column names before they are renamed
df.columns

In [None]:
# the first (unnamed) CSV column becomes the index, labelled 'ID'
df.set_index('Unnamed: 0', inplace=True)
df.index.name = 'ID'

# pythonic column names (lower case, underscores instead of spaces),
# so that columns can be reached as df.name
new_column_naming = {col: col.replace(' ', '_').lower() for col in df.columns}
df.rename(columns=new_column_naming, inplace=True)

# two special cases:
#  - 'at' clashes with the pandas `.at` accessor, so it becomes 'time'
#  - 'retouch_(face/body)' is made machine readable
df.rename(columns={'at': 'time',
                   'retouch_(face/body)': 'retouch_face_or_body'},
          inplace=True)


In [None]:
# summary statistics of the dataset's columns
df.describe()

In [None]:
# column dtypes and non-null counts
df.info()

In [None]:
# preliminary (still unsatisfactory) split of the columns into numerical and categorical ones
# NOTE(review): _get_numeric_data is a private pandas API; kept so the split itself is unchanged
numerical_columns = df._get_numeric_data().columns.tolist()
# keep the dataframe's own column order: the former set difference returned the
# names in an arbitrary order that could change from one session to the next
categorical_columns = [col for col in df.columns if col not in numerical_columns]
print('found {} categorical variables'.format(len(categorical_columns)))

In [None]:
# 'time' holds timestamps: parse it and track it apart from the categorical columns
timestamp_columns = ['time']
df['time'] = pd.to_datetime(df['time'])
categorical_columns.remove('time')


In [None]:
# display both column lists to check the split
numerical_columns, categorical_columns

## PRELIMINARY GRAPHICAL DATA ANALYSIS

In [None]:
# in this subsection, we aim to explore the dataset to see potential clues in the data
# without assuming a model

In [None]:
# create a local directory for images
! mkdir imgs

In [None]:
# histograms of the categorical columns (we should just keep score_x of this)
df[categorical_columns].hist(figsize = (20, 20))
# hist() draws one subplot per column, so the title belongs to the figure:
# plt.title would only label the last subplot
plt.suptitle('Categorical Columns Reviews Dataframe')
plt.savefig('imgs/histogram_categorical_cols.png')

In [None]:
# distribution of the individual review ratings
ax = df['score_x'].hist()
ax.set_title('Histogram of Reviews')
plt.savefig('imgs/histogram_reviews.png')

In [None]:
# this is too big, basically useless as they are all binary
# sns.pairplot(df[categorical_columns])
# plt.savefig('imgs/pairplot_categorical_cols.png')

In [None]:
# scatter matrix of all numerical columns
numerical_frame = df[numerical_columns]
sns.pairplot(numerical_frame)
plt.suptitle('Pair Plot Numerical Columns Reviews Dataframe', y=1.05)
plt.savefig('imgs/pairplot_numerical_columns.png')

In [None]:
# same pair plot, coloured by the review rating
# NOTE(review): if score_x is already part of numerical_columns this selects the column twice - confirm
sns.pairplot(df[numerical_columns + ['score_x']], hue = 'score_x')
plt.suptitle('Review rating Pair Plot Numerical Columns Reviews Dataframe', y = 1.05)
plt.savefig('imgs/hued_pairplot_numerical_columns.png')

In [None]:
# one kernel-density plot per numerical column, each saved to its own image
for col in numerical_columns:
  sns.displot(kind='kde', data=df[col])
  # name the column in the title, otherwise all saved figures carry the same title
  plt.title('Density Plot {} Reviews Dataframe'.format(col))
  # explicit extension, consistent with every other savefig call in the notebook
  plt.savefig('imgs/density_plot_' + col + '.png')

In [None]:
# review rating per store (google_play flag) as a box plot
sns.boxplot(data=df, x='google_play', y='score_x')
plt.title('Box Plot Store vs Rating Reviews Dataframe')
plt.savefig('imgs/boxplot_google_play_vs_score_x.png')

In [None]:
# box plot per store with the single reviews overlaid as jittered points
ax = sns.boxplot(data=df, x='google_play', y='score_x')
sns.stripplot(data=df, x='google_play', y='score_x',
              jitter=True, edgecolor='gray', ax=ax)
ax.set_title('Box Strip Plot Store vs Rating Reviews Dataframe')
plt.savefig('imgs/boxplot_stripped_google_play_vs_score_x.png')


In [None]:
# review rating per store as a violin plot
sns.violinplot(data=df, x='google_play', y='score_x')
plt.title('Violin Plot Store vs Rating Reviews Dataframe')
plt.savefig('imgs/violinplot_google_play_vs_score_x.png')

In [None]:
# score_y (labelled "Avg. Rating") for each review rating, as a box plot
sns.boxplot(data=df, x='score_x', y='score_y')
plt.title('Box Plot Rating vs Avg. Rating Reviews Dataframe')
plt.savefig('imgs/boxplot_score_x_vs_score_y.png')

In [None]:
# box plot of score_y per review rating with the single reviews overlaid
ax = sns.boxplot(data=df, x='score_x', y='score_y')
sns.stripplot(data=df, x='score_x', y='score_y',
              jitter=True, edgecolor='gray', ax=ax)
ax.set_title('Box Strip Plot Rating vs Avg. Rating Reviews Dataframe')
plt.savefig('imgs/boxplot_stripped_score_x_vs_score_y.png')

In [None]:
# score_y for each review rating, as a violin plot
sns.violinplot(data=df, x='score_x', y='score_y')
plt.title('Violin Plot Rating vs Avg. Rating Reviews Dataframe')
plt.savefig('imgs/violinplot_score_x_vs_score_y.png')

In [None]:
# heatmap of the pairwise correlations between the numerical columns
sns.heatmap(df[numerical_columns].corr())
plt.title('Correlation Matrix Numerical Columns Reviews Dataframe')
plt.savefig('imgs/correlation_matrix_numerical_columns.png')

In [None]:
# correlation heatmap over every numeric/boolean column of the dataframe.
# df also contains text and datetime columns: since pandas 2.0 DataFrame.corr
# no longer drops them silently and raises instead, so select the columns explicitly
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr())
plt.title('Correlation Matrix Reviews Dataframe')
plt.savefig('imgs/correlation_matrix.png')

In [None]:
# multivariate views of the reviews, grouped by rating
# NOTE(review): text_columns is not defined in any cell shown above; it has to
# list the free-text columns before this cell can run - confirm
review_features = df.drop(timestamp_columns + text_columns, axis=1)

# the pandas.plotting helpers draw on the current axes, so a new figure is
# opened for each plot; otherwise the three plots pile up in the same image
plt.figure()
pd.plotting.andrews_curves(review_features, "score_x")
plt.title('Andrews Curves of single Review')
plt.savefig('imgs/andrews_review.png')

plt.figure()
pd.plotting.parallel_coordinates(review_features, "score_x")
plt.title('Parallel Coordinates of single Review')
plt.savefig('imgs/parallel_coordinates_review.png')

plt.figure()
pd.plotting.radviz(review_features, "score_x")
plt.title('Radial Visualization of single Review')
plt.savefig('imgs/radvix_review.png')


In [None]:
# Andrews curves restricted to the numerical columns, grouped by rating
numerical_with_rating = df[numerical_columns + ['score_x']]
pd.plotting.andrews_curves(numerical_with_rating, 'score_x')
plt.title('Numerical Columns Andrews Curves of single Review')
plt.savefig('imgs/numerical_andrews_review.png')

In [None]:
# parallel coordinates restricted to the numerical columns, grouped by rating
numerical_with_rating = df[numerical_columns + ['score_x']]
pd.plotting.parallel_coordinates(numerical_with_rating, 'score_x')
plt.title('Numerical Columns Parallel Coordinates of single Review')
plt.savefig('imgs/numerical_parallel_coordinates_review.png')



In [None]:
# RadViz restricted to the numerical columns, grouped by rating
numerical_with_rating = df[numerical_columns + ['score_x']]
pd.plotting.radviz(numerical_with_rating, 'score_x')
plt.title('Numerical Columns Radial Visualization of single Review')
plt.savefig('imgs/numerical_radviz_review.png')


# STATISTICAL ANALYSIS
## DATASET MANIPULATION
We start with an app-level analysis that ignores the review text. Once the review-specific columns are dropped, each app is represented by a single row, so the resulting dataset has as many rows as there are apps.

In [None]:
# work on an independent copy so the reviews dataframe df is left untouched
df_apps = df.copy()

In [None]:
# drop review specific features
df_apps.drop(['replycontent',
              'time',
              'content',
              'score_x',
              'username'],
             axis = 1,
             inplace = True)
# now each app has multiple duplicate rows
old_length = df_apps.shape[0]
df_apps.drop_duplicates(inplace = True)
new_length = df_apps.shape[0]
print('The reviews dataframe has {} rows while the apps dataframe has {} rows'.format(old_length,new_length))
# reset index since now IDs of reviews are not useful anymore.
# reset_index returns a new frame unless inplace=True: the result used to be
# discarded, so df_apps silently kept the review IDs as its index
df_apps.reset_index(drop = True, inplace = True)
df_apps

We add a feature equal to the app score multiplied by the logarithm of its number of ratings. The logarithm is used so that the distribution
is more concentrated.

In [None]:
# app score weighted by the log of the number of ratings
# NOTE(review): np.log gives -inf for apps with 0 ratings - confirm ratings > 0
df_apps['score_y_times_log_ratings'] = df_apps.score_y * np.log(df_apps.ratings)
# build a new list: a plain assignment only aliased numerical_columns, so the
# append also added the new name to the list used for the reviews dataframe df,
# which has no such column
apps_numerical_columns = numerical_columns + ['score_y_times_log_ratings']


In [None]:
# kernel density estimate of the app score
df_apps['score_y'].plot.density()
plt.title('Density Plot score_y Apps Dataset')
plt.savefig('imgs/apps_density_plot_score_y.png')

In [None]:
# histogram of the app score
df_apps['score_y'].plot(kind = 'hist')
# title typo fixed ("Hisogram"): it ends up in the saved figure
plt.title('Histogram score_y Apps Dataset')
plt.savefig('imgs/apps_histogram_score_y.png')

In [None]:
# kernel density estimate of the number of ratings per app
df_apps['ratings'].plot.density()
plt.title('Density Plot ratings Apps Dataset')
plt.savefig('imgs/apps_density_plot_ratings.png')

In [None]:
# with the log-weighted score the density is now well behaved
df_apps['score_y_times_log_ratings'].plot.density()
plt.title('Density Plot score_y_times_log_ratings Apps Dataset')
plt.savefig('imgs/apps_density_plot_score_y_times_log_ratings.png')

In [None]:
# turn the weighted score into three rating categories
# (we might need to take a subsample of the big size categories)
# the quantile edges are unbalanced on purpose, to give more precision to high ratings
score_quantiles = [0, 0.44, 0.77, 1.0]
score_labels = ['Low', 'Mid', 'High']
df_apps['score_level'] = pd.qcut(x=df_apps['score_y_times_log_ratings'],
                                 q=score_quantiles,
                                 labels=score_labels)
df_apps['score_level'].hist()
plt.title('Histogram score_level Apps Dataset')
plt.savefig('imgs/apps_histogram_score_level.png')

## STATISTICAL MODELS FITTING

In [None]:
import matplotlib.pyplot as plt

In [None]:
import sklearn

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import metrics


In [None]:
# Colab-only: upload the modelling dataset and read it straight from memory
from google.colab import files
up = files.upload()
import io
# the uploaded file must be named exactly 'df_combined_20000.csv'
# NOTE: this rebinds df; from here on df is the combined dataset, not reviews_dataset.csv
df  = pd.read_csv(io.BytesIO(up['df_combined_20000.csv']))

df.head(10)

### ANALYSIS ON SCORE OF THE SINGLE REVIEW (from 1 to 5 stars)


In [None]:
from scipy import stats

**LINEAR REGRESSION - Google**

In [None]:
# target: rating of the single review; features: every column except the target,
# the app name columns and the app-level aggregates (ratings, score_y)
Y = df['score_x']  #linear regression
X = df.drop(['score_x', 'app_name_x','ratings','score_y', 'app_name1'], axis=1)

In [None]:
# ordinary least squares fit of Y on X
reg = LinearRegression().fit(X,Y)

In [None]:
# table pairing each feature name with its fitted coefficient
pd.DataFrame(zip(X.columns, reg.coef_))

In [None]:
# two-sided t-test p-values of the OLS coefficients in `reg`
# (intercept first, then one value per column of X)
predictions = reg.predict(X)
params = np.append(reg.intercept_, reg.coef_)
# design matrix with a leading column of ones for the intercept
new_X = np.append(np.ones((len(X), 1)), X, axis=1)
dof = len(new_X) - len(new_X[0])
# residual variance estimate: sum of squared residuals / degrees of freedom
M_S_E = sum((Y - predictions) ** 2) / dof
# coefficient variances: sigma^2 * diag((X'X)^-1)
v_b = M_S_E * np.linalg.inv(np.dot(new_X.T, new_X)).diagonal()
s_b = np.sqrt(v_b)
t_b = params / s_b
p_val = [2 * (1 - stats.t.cdf(np.abs(t_stat), dof)) for t_stat in t_b]
p_val = np.round(p_val, 3)
p_val

In [None]:
#only on Google reviews

In [None]:
# Google Play reviews only. .copy() makes df1 an independent frame: columns are
# added to it further down, which on a plain slice of df triggers SettingWithCopyWarning
df1 = df[df['google_play']==1].copy()
df1.head()

In [None]:
# same regression on the Google subset; google_play is constant here and is dropped, together with size
Y = df1['score_x']
X = df1.drop(['google_play','score_x', 'app_name_x','ratings','score_y', 'app_name1', 'size'], axis=1)

In [None]:
# ordinary least squares fit of Y on X
reg = LinearRegression().fit(X,Y)
# table pairing each feature name with its fitted coefficient
pd.DataFrame(zip(X.columns, reg.coef_))

calculate p-values

In [None]:

# two-sided t-test p-values of the OLS coefficients in `reg`
# (intercept first, then one value per column of X)
predictions = reg.predict(X)
params = np.append(reg.intercept_, reg.coef_)
# design matrix with a leading column of ones for the intercept
new_X = np.append(np.ones((len(X), 1)), X, axis=1)
dof = len(new_X) - len(new_X[0])
# residual variance estimate: sum of squared residuals / degrees of freedom
M_S_E = sum((Y - predictions) ** 2) / dof
# coefficient variances: sigma^2 * diag((X'X)^-1)
v_b = M_S_E * np.linalg.inv(np.dot(new_X.T, new_X)).diagonal()
s_b = np.sqrt(v_b)
t_b = params / s_b
p_val = [2 * (1 - stats.t.cdf(np.abs(t_stat), dof)) for t_stat in t_b]
p_val = np.round(p_val, 3)
p_val

**LINEAR REGRESSION - Apple**

In [None]:
# Apple reviews only (google_play == 0). .copy() makes df2 an independent frame: columns
# are added to it further down, which on a plain slice of df triggers SettingWithCopyWarning
df2 = df[df['google_play']==0].copy() #only on Apple reviews
df2.head()

In [None]:
# same regression on the Apple subset; google_play is constant here and is dropped, together with size
Y = df2['score_x']
X = df2.drop(['google_play','score_x', 'app_name_x','ratings','score_y', 'app_name1', 'size'], axis=1)

In [None]:
# ordinary least squares fit of Y on X
reg = LinearRegression().fit(X,Y)
# table pairing each feature name with its fitted coefficient
pd.DataFrame(zip(X.columns, reg.coef_))

Calculate p-values

In [None]:

# two-sided t-test p-values of the OLS coefficients in `reg`
# (intercept first, then one value per column of X)
predictions = reg.predict(X)
params = np.append(reg.intercept_, reg.coef_)
# design matrix with a leading column of ones for the intercept
new_X = np.append(np.ones((len(X), 1)), X, axis=1)
dof = len(new_X) - len(new_X[0])
# residual variance estimate: sum of squared residuals / degrees of freedom
M_S_E = sum((Y - predictions) ** 2) / dof
# coefficient variances: sigma^2 * diag((X'X)^-1)
v_b = M_S_E * np.linalg.inv(np.dot(new_X.T, new_X)).diagonal()
s_b = np.sqrt(v_b)
t_b = params / s_b
p_val = [2 * (1 - stats.t.cdf(np.abs(t_stat), dof)) for t_stat in t_b]
p_val = np.round(p_val, 3)
p_val

**LINEAR CLASSIFICATION - both Google and Apple**

In [None]:
# binary target: 1 for 4 or 5 stars, 0 for 1 to 3 stars.
# It is (re)built here because no earlier cell creates this column on df; the
# definition is the one used for the Google/Apple subsets below
df['score_x_dummy'] = (df['score_x'] > 3).astype(int)
Y = df['score_x_dummy'] #linear classification
X = df.drop(['score_x_dummy', 'score_x', 'app_name_x','ratings','score_y', 'app_name1'], axis=1)

In [None]:
# logistic regression of the binary target on X (scikit-learn defaults)
logreg = LogisticRegression().fit(X,Y)
# table pairing each feature name with its fitted coefficient
pd.DataFrame(zip(X.columns, logreg.coef_[0]))

In [None]:
#calculate p-values
# Wald z-tests for the logistic model `logreg` fitted in the previous cell.
# The former version took coefficients and predictions from `reg`, the last
# LinearRegression, which was fitted on another target and design matrix.
# NOTE(review): LogisticRegression is L2-penalised by default, so these p-values
# are approximate; refit an unpenalised model if exact inference is needed.
params = np.append(logreg.intercept_, logreg.coef_[0])
# predicted probability of the positive class (rating > 3)
predictions = logreg.predict_proba(X)[:, 1]
new_X = np.append(np.ones((len(X),1)), X, axis=1)
# Fisher information X' W X with W = diag(p * (1 - p)); its inverse is the covariance
weights = predictions * (1 - predictions)
v_b = np.linalg.inv(np.dot(new_X.T * weights, new_X)).diagonal()
s_b = np.sqrt(v_b)
t_b = params / s_b
p_val = np.round(2 * stats.norm.sf(np.abs(t_b)), 3)
p_val

**LOGISTIC REGRESSION - only Google**
the y is now a dummy with 1 being 4 or 5 stars (the best) and 0 from 1 to 3

In [None]:
# binary target for the Google-only model: 1 = 4 or 5 stars, 0 = 1 to 3 stars
df1['score_x_dummy'] = df1['score_x'].map(lambda stars: 1 if stars > 3 else 0) #only Google
Y = df1['score_x_dummy']
excluded_columns = ['score_x_dummy', 'score_x', 'app_name_x','ratings','score_y', 'app_name1', 'google_play']
X = df1.drop(columns=excluded_columns)

In [None]:
# logistic regression of the binary target on X (scikit-learn defaults)
logreg = LogisticRegression().fit(X,Y)
# table pairing each feature name with its fitted coefficient
pd.DataFrame(zip(X.columns, logreg.coef_[0]))

In [None]:
#calculate p-values
# Wald z-tests for the logistic model `logreg` fitted in the previous cell.
# The former version took coefficients and predictions from `reg`, the last
# LinearRegression, which was fitted on another target and design matrix.
# NOTE(review): LogisticRegression is L2-penalised by default, so these p-values
# are approximate; refit an unpenalised model if exact inference is needed.
params = np.append(logreg.intercept_, logreg.coef_[0])
# predicted probability of the positive class (rating > 3)
predictions = logreg.predict_proba(X)[:, 1]
new_X = np.append(np.ones((len(X),1)), X, axis=1)
# Fisher information X' W X with W = diag(p * (1 - p)); its inverse is the covariance
weights = predictions * (1 - predictions)
v_b = np.linalg.inv(np.dot(new_X.T * weights, new_X)).diagonal()
s_b = np.sqrt(v_b)
t_b = params / s_b
p_val = np.round(2 * stats.norm.sf(np.abs(t_b)), 3)
p_val

**LOGISTIC REGRESSION - only Apple**

In [None]:
# binary target for the Apple-only model: 1 = 4 or 5 stars, 0 = 1 to 3 stars
df2['score_x_dummy'] = df2['score_x'].map(lambda stars: 1 if stars > 3 else 0) #only Apple
Y = df2['score_x_dummy']
excluded_columns = ['score_x_dummy', 'score_x', 'app_name_x','ratings','score_y', 'app_name1', 'google_play']
X = df2.drop(columns=excluded_columns)

In [None]:
# logistic regression of the binary target on X (scikit-learn defaults)
logreg = LogisticRegression().fit(X,Y)
# table pairing each feature name with its fitted coefficient
pd.DataFrame(zip(X.columns, logreg.coef_[0]))

In [None]:
#calculate p-values
# Wald z-tests for the logistic model `logreg` fitted in the previous cell.
# The former version took coefficients and predictions from `reg`, the last
# LinearRegression, which was fitted on another target and design matrix.
# NOTE(review): LogisticRegression is L2-penalised by default, so these p-values
# are approximate; refit an unpenalised model if exact inference is needed.
params = np.append(logreg.intercept_, logreg.coef_[0])
# predicted probability of the positive class (rating > 3)
predictions = logreg.predict_proba(X)[:, 1]
new_X = np.append(np.ones((len(X),1)), X, axis=1)
# Fisher information X' W X with W = diag(p * (1 - p)); its inverse is the covariance
weights = predictions * (1 - predictions)
v_b = np.linalg.inv(np.dot(new_X.T * weights, new_X)).diagonal()
s_b = np.sqrt(v_b)
t_b = params / s_b
p_val = np.round(2 * stats.norm.sf(np.abs(t_b)), 3)
p_val

**ORDERED LOGIT - Google**

In [None]:
import statsmodels.api as sm

In [None]:
# ordered categorical target: score_x cut into 5 equal-width bins labelled 1..5
# NOTE(review): bins=5 spans the observed min..max of score_x, so the labels equal
# the star ratings only if both 1 and 5 occur in the data - confirm
df['ordered_score'] = pd.cut(df['score_x'], bins=5, labels=[1, 2, 3, 4, 5], ordered=True) #ordered logit
Y = df['ordered_score']
X = df.drop(['ordered_score','score_x_dummy','score_x', 'score_x_dummy1', 'app_name_x','ratings','score_y', 'app_name1', 'size'], axis=1)

In [None]:
# OrderedModel is never imported in the notebook (`import statsmodels.api as sm`
# does not bring the bare name into scope), so import it where it is used
from statsmodels.miscmodels.ordinal_model import OrderedModel

# ordered logit of Y on X, fitted with BFGS
mod_log = OrderedModel(Y,X,
                        distr='logit')

res_log = mod_log.fit(method='bfgs', disp=False)
res_log.summary()

In [None]:
#calculate p-values
# statsmodels already provides them for the ordered logit fitted in the previous cell.
# The former version derived OLS t-tests from `reg`, an unrelated LinearRegression
# fitted on other columns, and subtracted its predictions from the categorical Y.
params = res_log.params
p_val = np.round(res_log.pvalues, 3)
p_val

In [None]:
# ordered categorical target on the Google subset: score_x cut into 5 equal-width bins labelled 1..5
# NOTE(review): bins=5 spans the observed min..max of score_x - confirm both 1 and 5 occur
df1['ordered_score'] = pd.cut(df1['score_x'], bins=5, labels=[1, 2, 3, 4, 5], ordered=True) #only Google
Y = df1['ordered_score']
X = df1.drop(['ordered_score','score_x_dummy','score_x', 'score_x_dummy1', 'app_name_x','ratings','score_y', 'app_name1', 'size', 'google_play'], axis=1)

In [None]:
# OrderedModel is never imported in the notebook (`import statsmodels.api as sm`
# does not bring the bare name into scope), so import it where it is used
from statsmodels.miscmodels.ordinal_model import OrderedModel

# ordered logit of Y on X, fitted with BFGS
mod_log = OrderedModel(Y,X,
                        distr='logit')

res_log = mod_log.fit(method='bfgs', disp=False)
res_log.summary()

In [None]:
#calculate p-values
# statsmodels already provides them for the ordered logit fitted in the previous cell.
# The former version derived OLS t-tests from `reg`, an unrelated LinearRegression
# fitted on other columns, and subtracted its predictions from the categorical Y.
params = res_log.params
p_val = np.round(res_log.pvalues, 3)
p_val

**ORDERED LOGIT - Apple**

In [None]:
# ordered categorical target on the Apple subset: score_x cut into 5 equal-width bins labelled 1..5
# NOTE(review): bins=5 spans the observed min..max of score_x - confirm both 1 and 5 occur
df2['ordered_score'] = pd.cut(df2['score_x'], bins=5, labels=[1, 2, 3, 4, 5], ordered=True) #only Apple
Y = df2['ordered_score']
X = df2.drop(['ordered_score','score_x_dummy','score_x', 'score_x_dummy1', 'app_name_x','ratings','score_y', 'app_name1', 'size', 'google_play'], axis=1)

In [None]:
# OrderedModel is never imported in the notebook (`import statsmodels.api as sm`
# does not bring the bare name into scope), so import it where it is used
from statsmodels.miscmodels.ordinal_model import OrderedModel

# ordered logit of Y on X, fitted with BFGS
mod_log = OrderedModel(Y,X,
                        distr='logit')

res_log = mod_log.fit(method='bfgs', disp=False)
res_log.summary()

In [None]:
#calculate p-values
# statsmodels already provides them for the ordered logit fitted in the previous cell.
# The former version derived OLS t-tests from `reg`, an unrelated LinearRegression
# fitted on other columns, and subtracted its predictions from the categorical Y.
params = res_log.params
p_val = np.round(res_log.pvalues, 3)
p_val

## ANALYSIS OF FEATURES' EFFECT ON OVERALL APPLICATION SCORE (from 1 to 5 stars)


**REGRESSION**

In [None]:
# NOTE(review): df_d is not created in any cell shown above - confirm where it comes from
# target: score_x; the drop list removes identifiers, text, timestamps and derived targets
# ("score_y" is listed twice)
Y = df_d['score_x']  #regression
X = df_d.drop(['score_x', 'price', 'Unnamed: 0', 'replyContent', 'ratings', 'score_y', "score_y", "at", "userName", "app_name","content", 'google_play', 'score_x_dummy1', 'ordered_score'], axis=1)

In [None]:
# ordinary least squares fit of Y on X
reg = LinearRegression().fit(X,Y)

In [None]:
# table pairing each feature name with its fitted coefficient
pd.DataFrame(zip(X.columns, reg.coef_))

In [None]:
# two-sided t-test p-values of the OLS coefficients in `reg`
# (intercept first, then one value per column of X)
predictions = reg.predict(X)
params = np.append(reg.intercept_, reg.coef_)
# design matrix with a leading column of ones for the intercept
new_X = np.append(np.ones((len(X), 1)), X, axis=1)
dof = len(new_X) - len(new_X[0])
# residual variance estimate: sum of squared residuals / degrees of freedom
M_S_E = sum((Y - predictions) ** 2) / dof
# coefficient variances: sigma^2 * diag((X'X)^-1)
v_b = M_S_E * np.linalg.inv(np.dot(new_X.T, new_X)).diagonal()
s_b = np.sqrt(v_b)
t_b = params / s_b
p_val = [2 * (1 - stats.t.cdf(np.abs(t_stat), dof)) for t_stat in t_b]
p_val = np.round(p_val, 3)
p_val

**CLASSIFICATION**

In [None]:
# binary target: 1 for 4 or 5 stars, 0 for 1 to 3 stars
# NOTE(review): the neighbouring cells of this section use df_d, this one uses df - confirm
df['score_x_dummy'] = df['score_x'].apply(lambda x: 1 if x > 3 else 0) #classification
Y = df['score_x_dummy']
# 'score_x_dummy' is dropped as well: it used to stay in X, i.e. the target
# itself was one of the features (leakage)
X = df.drop(['score_x_dummy', 'score_x', 'price', 'Unnamed: 0', 'replyContent', 'ratings', 'score_y', "score_y", "at", "userName", "app_name","content"], axis=1)

In [None]:
#calculate p-values
# No classifier was fitted for the X/Y of this section: the former version
# reused `reg`, the linear regression of the previous section. Fit the logistic
# model here and derive Wald z-tests from it.
# NOTE(review): LogisticRegression is L2-penalised by default, so these p-values
# are approximate; refit an unpenalised model if exact inference is needed.
logreg = LogisticRegression().fit(X,Y)
params = np.append(logreg.intercept_, logreg.coef_[0])
# predicted probability of the positive class (rating > 3)
predictions = logreg.predict_proba(X)[:, 1]
new_X = np.append(np.ones((len(X),1)), X, axis=1)
# Fisher information X' W X with W = diag(p * (1 - p)); its inverse is the covariance
weights = predictions * (1 - predictions)
v_b = np.linalg.inv(np.dot(new_X.T * weights, new_X)).diagonal()
s_b = np.sqrt(v_b)
t_b = params / s_b
p_val = np.round(2 * stats.norm.sf(np.abs(t_b)), 3)
p_val

In [None]:
import statsmodels.api as sm

**ORDERED LOGIT**

In [None]:
# ordered categorical target: score_x cut into 5 equal-width bins labelled 1..5
# NOTE(review): df_d is not created in any cell shown above - confirm where it comes from
df_d['ordered_score'] = pd.cut(df_d['score_x'], bins=5, labels=[1, 2, 3, 4, 5], ordered=True) #ordered logit
Y = df_d['ordered_score']
X = df_d.drop(['ordered_score','score_x_dummy1', 'google_play', 'score_x', 'price', 'Unnamed: 0', 'replyContent', 'ratings', 'score_y', "score_y", "at", "userName", "app_name","content"], axis=1)


In [None]:
# OrderedModel is never imported in the notebook (`import statsmodels.api as sm`
# does not bring the bare name into scope), so import it where it is used
from statsmodels.miscmodels.ordinal_model import OrderedModel

# ordered logit of Y on X, fitted with BFGS
mod_log = OrderedModel(Y,X,
                        distr='logit')

res_log = mod_log.fit(method='bfgs', disp=False)
res_log.summary()

In [None]:
#calculate p-values
# statsmodels already provides them for the ordered logit fitted in the previous cell.
# The former version derived OLS t-tests from `reg`, an unrelated LinearRegression
# fitted on other columns, and subtracted its predictions from the categorical Y.
params = res_log.params
p_val = np.round(res_log.pvalues, 3)
p_val

## ANALYSIS ON SCORE OF THE APP multiplied per NUMBER OF RATINGS
This variable can be considered a proxy for success, since it takes into account both popularity and quality.

In [None]:
# NOTE(review): df3 is not created in any cell shown above - confirm where it comes from
# target: the 'score*ratings' column; features: everything except the target, the
# review/app scores it is related to, the app names and size
Y = df3['score*ratings']
X = df3.drop(['score*ratings','score_x_dummy','score_x', 'app_name_x','ratings','score_y', 'app_name1', 'size'], axis=1)

**REGRESSION**

In [None]:
# ordinary least squares fit of Y on X
reg = LinearRegression().fit(X,Y)
# table pairing each feature name with its fitted coefficient
pd.DataFrame(zip(X.columns, reg.coef_))

In [None]:
# two-sided t-test p-values of the OLS coefficients in `reg`
# (intercept first, then one value per column of X)
predictions = reg.predict(X)
params = np.append(reg.intercept_, reg.coef_)
# design matrix with a leading column of ones for the intercept
new_X = np.append(np.ones((len(X), 1)), X, axis=1)
dof = len(new_X) - len(new_X[0])
# residual variance estimate: sum of squared residuals / degrees of freedom
M_S_E = sum((Y - predictions) ** 2) / dof
# coefficient variances: sigma^2 * diag((X'X)^-1)
v_b = M_S_E * np.linalg.inv(np.dot(new_X.T, new_X)).diagonal()
s_b = np.sqrt(v_b)
t_b = params / s_b
p_val = [2 * (1 - stats.t.cdf(np.abs(t_stat), dof)) for t_stat in t_b]
p_val = np.round(p_val, 3)
p_val

**REGRESSION - only Google**

In [None]:
# Google Play rows of df3. The mask is built from df3's own google_play column:
# it used to come from df, a different frame whose index need not match df3's
df4=df3[df3['google_play']==1]  #only Google
df4.head()

In [None]:
# same target and features on the Google subset; the constant google_play flag is dropped too
Y = df4['score*ratings']
X = df4.drop(['score*ratings','score_x_dummy','score_x', 'app_name_x','ratings','score_y', 'app_name1', 'size', 'google_play'], axis=1)

In [None]:
# ordinary least squares fit of Y on X
reg = LinearRegression().fit(X,Y)
# table pairing each feature name with its fitted coefficient
pd.DataFrame(zip(X.columns, reg.coef_))

In [None]:
# two-sided t-test p-values of the OLS coefficients in `reg`
# (intercept first, then one value per column of X)
predictions = reg.predict(X)
params = np.append(reg.intercept_, reg.coef_)
# design matrix with a leading column of ones for the intercept
new_X = np.append(np.ones((len(X), 1)), X, axis=1)
dof = len(new_X) - len(new_X[0])
# residual variance estimate: sum of squared residuals / degrees of freedom
M_S_E = sum((Y - predictions) ** 2) / dof
# coefficient variances: sigma^2 * diag((X'X)^-1)
v_b = M_S_E * np.linalg.inv(np.dot(new_X.T, new_X)).diagonal()
s_b = np.sqrt(v_b)
t_b = params / s_b
p_val = [2 * (1 - stats.t.cdf(np.abs(t_stat), dof)) for t_stat in t_b]
p_val = np.round(p_val, 3)
p_val

**REGRESSION - only Apple**

In [None]:
# Apple rows of df3 (google_play == 0). The mask is built from df3's own google_play
# column: it used to come from df, a different frame whose index need not match df3's
df5=df3[df3['google_play']==0] #only Apple
df5.head()

In [None]:
# same target and features on the Apple subset; the constant google_play flag is dropped too
Y = df5['score*ratings']
X = df5.drop(['score*ratings','score_x_dummy','score_x', 'app_name_x','ratings','score_y', 'app_name1', 'size', 'google_play'], axis=1)

In [None]:
# ordinary least squares fit of Y on X
reg = LinearRegression().fit(X,Y)
# table pairing each feature name with its fitted coefficient
pd.DataFrame(zip(X.columns, reg.coef_))

In [None]:
# two-sided t-test p-values of the OLS coefficients in `reg`
# (intercept first, then one value per column of X)
predictions = reg.predict(X)
params = np.append(reg.intercept_, reg.coef_)
# design matrix with a leading column of ones for the intercept
new_X = np.append(np.ones((len(X), 1)), X, axis=1)
dof = len(new_X) - len(new_X[0])
# residual variance estimate: sum of squared residuals / degrees of freedom
M_S_E = sum((Y - predictions) ** 2) / dof
# coefficient variances: sigma^2 * diag((X'X)^-1)
v_b = M_S_E * np.linalg.inv(np.dot(new_X.T, new_X)).diagonal()
s_b = np.sqrt(v_b)
t_b = params / s_b
p_val = [2 * (1 - stats.t.cdf(np.abs(t_stat), dof)) for t_stat in t_b]
p_val = np.round(p_val, 3)
p_val

## ANALYSIS ON NUMBER OF REVIEWS AND ANALYSIS OF SCORE OF THE APP
I try to understand whether the previous results are driven by one of the two factors of the product.

In [None]:
# regression of the number of ratings on the features (whole dataset)
Y = df['ratings']
X = df.drop(['score*ratings','score_x_dummy','score_x', 'app_name_x','ratings','score_y', 'app_name1', 'size'], axis=1)
# fit right away, as done for the Google and Apple subsets: the next cell
# overwrites X and Y for the score_y regression, so this model was never estimated
reg = LinearRegression().fit(X,Y)
pd.DataFrame(zip(X.columns, reg.coef_))

In [None]:
# regression of the app score (score_y) on the features; 'ratings' appears twice in the drop list
Y = df['score_y']
X = df.drop(['score*ratings','ratings','score_x_dummy','score_x', 'app_name_x','ratings','score_y', 'app_name1', 'size'], axis=1)

In [None]:
# ordinary least squares fit of Y on X
reg = LinearRegression().fit(X,Y)
# table pairing each feature name with its fitted coefficient
pd.DataFrame(zip(X.columns, reg.coef_))

In [None]:
# rebuild the Google subset from the current df
df1=df[df['google_play']==1]  #only Google
df1.head()

In [None]:
# number of ratings as target, Google subset
Y = df1['ratings']
X = df1.drop(['score*ratings','score_x_dummy','score_x', 'app_name_x','ratings','score_y', 'app_name1', 'size', 'google_play'], axis=1)

In [None]:
# ordinary least squares fit of Y on X
reg = LinearRegression().fit(X,Y)
# table pairing each feature name with its fitted coefficient
pd.DataFrame(zip(X.columns, reg.coef_))

In [None]:
# app score (score_y) as target, Google subset; 'ratings' appears twice in the drop list
Y = df1['score_y']
X = df1.drop(['score*ratings','ratings','score_x_dummy','score_x', 'app_name_x','ratings','score_y', 'app_name1', 'size', 'google_play'], axis=1)

In [None]:
# ordinary least squares fit of Y on X
reg = LinearRegression().fit(X,Y)
# table pairing each feature name with its fitted coefficient
pd.DataFrame(zip(X.columns, reg.coef_))

In [None]:
# rebuild the Apple subset (google_play == 0) from the current df
df2=df[df['google_play']==0] #only Apple

In [None]:
# number of ratings as target, Apple subset; the log-based derived columns are dropped here as well
Y = df2['ratings']
X = df2.drop(['score*ratings','score_x_dummy','score_x', 'app_name_x','ratings','score_y', 'app_name1', 'size', 'google_play', 'log_ratings', 'score*log(ratings)'], axis=1)

In [None]:
# ordinary least squares fit of Y on X
reg = LinearRegression().fit(X,Y)
# table pairing each feature name with its fitted coefficient
pd.DataFrame(zip(X.columns, reg.coef_))

In [None]:
# app score (score_y) as target, Apple subset; 'ratings' appears twice in the drop list
Y = df2['score_y']
X = df2.drop(['score*ratings','ratings','score_x_dummy','score_x', 'app_name_x','ratings','score_y', 'app_name1', 'size', 'google_play', 'log_ratings', 'score*log(ratings)'], axis=1)

In [None]:
# ordinary least squares fit of Y on X
reg = LinearRegression().fit(X,Y)
# table pairing each feature name with its fitted coefficient
pd.DataFrame(zip(X.columns, reg.coef_))

# FREQUENT MONOGRAM ANALYSIS

In [None]:
from wordcloud import WordCloud
# NOTE: this instance is replaced by the wc() function defined in the next cell, which reuses the name
wc = WordCloud(background_color="white", max_words=250, colormap="Set2")
from nltk import sent_tokenize, word_tokenize

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
# English stop words plus domain words that carry no sentiment
# (the capitalised entries cannot match below, where the reviews are lower-cased first)
stop = stopwords.words('english')
stop = stop + ['app', 'APP' ,'ap', 'App', 'apps', 'application', 'browser', 'website', 'websites', 'chrome', 'click', 'web', 'ip', 'address',
            'files', 'android', 'browse', 'service', 'use', 'one', 'download', 'email', 'Launcher', 'video', 'Video', 'photo', 'Photo','edit', 'editing', 'video', 'good']

In [None]:
## Wordcloud Function
def wc(data,bgcolor,title='Common Words in Reviews'):
    """Draw a word cloud of *data* on a new matplotlib figure.

    data: iterable of words; they are joined with spaces and handed to WordCloud
    bgcolor: background colour of the cloud
    title: title written above the plot
    """
    plt.figure(figsize = (100,100))
    # the local name differs from the function's own name, which it used to shadow
    cloud = WordCloud(background_color = bgcolor, max_words = 1000,  max_font_size = 50, )
    cloud.generate(' '.join(data))
    plt.imshow(cloud)
    plt.axis('off')
    # honour the caller's title: it used to be ignored in favour of a hard-coded one
    plt.title(title)

## Frequency of words in translated review column
from collections import Counter
from nltk.tokenize import RegexpTokenizer
#from stop_words import get_stop_words
import re

top_N = 100

# set membership is O(1); `stop` is a list and is probed once per token of the corpus
stop_words = set(stop)

# NOTE(review): df_reviews is not created in any cell shown above - confirm where it comes from
# one lower-cased text blob per sentiment class: positive (>= 4), negative (< 3), neutral (== 3)
pos_review_lower = df_reviews[df_reviews['score']>=4]['content'].str.lower().str.cat(sep=' ')
neg_review_lower = df_reviews[df_reviews['score']<3]['content'].str.lower().str.cat(sep=' ')
neu_review_lower = df_reviews[df_reviews['score']==3]['content'].str.lower().str.cat(sep=' ')


## Remove Punctuations (every run of non-letters becomes a single space)
pos_review_remove_pun = re.sub('[^A-Za-z]+', ' ', pos_review_lower)
neg_review_remove_pun = re.sub('[^A-Za-z]+', ' ', neg_review_lower)
neu_review_remove_pun = re.sub('[^A-Za-z]+', ' ', neu_review_lower)

#remove all the stopwords from the text
pos_word_tokens_tags = word_tokenize(pos_review_remove_pun)
neg_word_tokens_tags = word_tokenize(neg_review_remove_pun)
neu_word_tokens_tags = word_tokenize(neu_review_remove_pun)
# each list is built once: it used to be built by a comprehension, thrown away
# and rebuilt by an equivalent loop
pos_filtered_sentence_tags = [w_tags for w_tags in pos_word_tokens_tags if w_tags not in stop_words]
neg_filtered_sentence_tags = [w_tags for w_tags in neg_word_tokens_tags if w_tags not in stop_words]
neu_filtered_sentence_tags = [w_tags for w_tags in neu_word_tokens_tags if w_tags not in stop_words]

# Keep only the words longer than two characters

pos_without_single_chr_rev = [word_tags for word_tags in pos_filtered_sentence_tags if len(word_tags) > 2]
neg_without_single_chr_rev = [word_tags for word_tags in neg_filtered_sentence_tags if len(word_tags) > 2]
neu_without_single_chr_rev = [word_tags for word_tags in neu_filtered_sentence_tags if len(word_tags) > 2]

In [None]:
# word cloud of the words found in the negative reviews (score < 3), white background
wc(neg_without_single_chr_rev,'white','Common Words' )

In [None]:
# word cloud of the words found in the positive reviews (score >= 4), white background
wc(pos_without_single_chr_rev,'white','Common Words' )

# FREQUENT BIGRAM
I try to understand whether there is a relation between good (or bad) reviews and some frequent bigrams (groups of two consecutive words).

In [None]:
# replace missing review texts with empty strings
# NOTE(review): `dataset` is not created in any cell shown above, and the cells below read g_df - confirm
dataset["content"]= dataset["content"].fillna("")

In [None]:
from wordcloud import WordCloud
# NOTE: this instance is replaced by the wc() function defined further down in this cell
wc = WordCloud(background_color="white", max_words=250, colormap="Set2")
from nltk import sent_tokenize, word_tokenize

from collections import Counter
from nltk.tokenize import RegexpTokenizer
import re


import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
# English stop words plus domain/filler words excluded from the bigrams
# (the capitalised entries cannot match below, where the reviews are lower-cased first)
stop = stopwords.words('english')
stop = stop + ['app', 'APP' ,'ap', 'App', 'apps', 'application', 'browser', 'website', 'websites', 'chrome', 'click', 'web', 'ip', 'address',
            'files', 'android', 'browse', 'service', 'use', 'one', 'download', 'email', 'Launcher','please','love','it','the','i',
              'I','my','like','really','every','would','even','though','game','review']


def wc(data,bgcolor,title='Common Phrases in Reviews'):
    """Draw a word cloud of *data* on a new matplotlib figure.

    data: iterable of phrases; they are joined with spaces and handed to WordCloud
    bgcolor: background colour of the cloud
    title: title written above the plot
    """
    plt.figure(figsize = (100,100))
    # the local name differs from the function's own name, which it used to shadow
    cloud = WordCloud(background_color = bgcolor, max_words = 300,  max_font_size = 50, )
    cloud.generate(' '.join(data))
    plt.imshow(cloud)
    plt.axis('off')
    # honour the caller's title: it used to be ignored in favour of a hard-coded one
    plt.title(title)



def pair_split(x):
    """Turn the text *x* into its list of bigrams, written as 'first_second'.

    Runs of characters other than ASCII letters and underscores are treated as
    separators and the words listed in the global `stop` are discarded before
    pairing. A text left with a single word is returned as that one-word list;
    a text left with no words yields an empty list.
    """
    letters_only = re.sub('[^A-Za-z_]+', ' ', x)
    kept = [token for token in letters_only.split() if token not in stop]
    if len(kept) == 1:
        return kept
    # pair every word with its successor
    return [first + '_' + second for first, second in zip(kept, kept[1:])]



## Frequency of words in translated review column

top_N = 100

# set membership is O(1); `stop` is a list and is probed once per token of the corpus
stop_words = set(stop)

## Get every pair of words from the reviews
# NOTE(review): g_df is not created in any cell shown above - confirm where it comes from
# one blob of space-separated bigrams per sentiment class: positive (>= 4), negative (< 3), neutral (== 3)

pos_review_lower = g_df[g_df['score']>=4]['content'].str.lower().apply(pair_split).apply(lambda x: " ".join(x)).str.cat(sep=' ')
neg_review_lower = g_df[g_df['score']<3]['content'].str.lower().apply(pair_split).apply(lambda x: " ".join(x)).str.cat(sep=' ')
neu_review_lower = g_df[g_df['score']==3]['content'].str.lower().apply(pair_split).apply(lambda x: " ".join(x)).str.cat(sep=' ')


# keep only the tokens with an underscore after their first character, which
# discards the single words that pair_split returns for one-word reviews
pos_review_lower_rem = [a for a in pos_review_lower.split(' ') if a.find('_') >0]
pos_review_remove_pun = " ".join(pos_review_lower_rem)

neg_review_lower_rem = [a for a in neg_review_lower.split(' ') if a.find('_') >0]
neg_review_remove_pun = " ".join(neg_review_lower_rem)

neu_review_lower_rem = [a for a in neu_review_lower.split(' ') if a.find('_') >0]
neu_review_remove_pun = " ".join(neu_review_lower_rem)


pos_word_tokens_tags = word_tokenize(pos_review_remove_pun)
neg_word_tokens_tags = word_tokenize(neg_review_remove_pun)
neu_word_tokens_tags = word_tokenize(neu_review_remove_pun)
# each list is built once: it used to be built by a comprehension, thrown away
# and rebuilt by an equivalent loop
pos_filtered_sentence_tags = [w_tags for w_tags in pos_word_tokens_tags if w_tags not in stop_words]
neg_filtered_sentence_tags = [w_tags for w_tags in neg_word_tokens_tags if w_tags not in stop_words]
neu_filtered_sentence_tags = [w_tags for w_tags in neu_word_tokens_tags if w_tags not in stop_words]

# Keep only the tokens longer than two characters

pos_without_single_chr_rev = [word_tags for word_tags in pos_filtered_sentence_tags if len(word_tags) > 2]
neg_without_single_chr_rev = [word_tags for word_tags in neg_filtered_sentence_tags if len(word_tags) > 2]
neu_without_single_chr_rev = [word_tags for word_tags in neu_filtered_sentence_tags if len(word_tags) > 2]


In [None]:
# the 50 most frequent phrases of the negative reviews, as (phrase, count) pairs
from collections import Counter
counts = Counter(neg_without_single_chr_rev)
count_top50 = counts.most_common(50)
count_top50

In [None]:
#graph frequency negative
import seaborn as sns
# the negative counts live in count_top50; count_top30 is only assigned further
# down (positive reviews), so using it here fails on a fresh run or plots the
# positive phrases under the negative title
count_top30_df = pd.DataFrame(count_top50, columns=["Phrases","Count"])
plt.figure(figsize=(8, 6))
sns.set(font_scale=1)
category_plot = sns.barplot(x="Phrases",y ="Count",data=count_top30_df, palette = "RdYlBu")
# vertical tick labels so the phrases stay readable
category_plot.set_xticklabels(category_plot.get_xticklabels(), rotation=90, ha="right")
plt.title('Common Phrases in Negative Reviews',size = 18)

In [None]:
 #count positive
# the 50 most frequent phrases of the positive reviews, as (phrase, count) pairs
# (despite its name, count_top30 holds 50 entries)
from collections import Counter
counts = Counter(pos_without_single_chr_rev)
count_top30 = counts.most_common(50)
count_top30

In [None]:
# bar chart of the most frequent phrases in the positive reviews
count_top30_df = pd.DataFrame(count_top30, columns=["Phrases","Count"])
plt.figure(figsize=(8, 6))
sns.set(font_scale=1)
category_plot = sns.barplot(x="Phrases",y ="Count",data=count_top30_df, palette = "YlGnBu_r")
# vertical tick labels so the phrases stay readable
category_plot.set_xticklabels(category_plot.get_xticklabels(), rotation=90, ha="right")
plt.title('Common Phrases in Positive Reviews',size = 18)