In [None]:
# Standard library
from datetime import datetime

# Data handling and plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical tests. `pearsonr` is taken from the public `scipy.stats`
# namespace; the former `scipy.stats.stats` path is a deprecated alias.
from scipy.stats import f_oneway, pearsonr, spearmanr

# Model used for the feature-importance ranking
from sklearn.ensemble import RandomForestRegressor

# Allow up to 500 columns when a wide frame is displayed
pd.set_option('display.max_columns', 500)

In [None]:
# Read the full download and upload measurement tables.
data_down = pd.read_csv('data_down_full.csv')
data_up = pd.read_csv('data_up_full.csv')

# Physical cell ID (PCI) columns are the ones whose name consists only of
# digits; collect them separately for download and upload.
pci_down = [name for name in data_down.columns if name.isdigit()]
pci_up = [name for name in data_up.columns if name.isdigit()]

# Random Forest Regressor (RFR) Feature Importance


In [None]:
# Columns removed from both tables before the analysis below.
# NOTE(review): 'rawTimesamp' is spelled exactly as in the original code —
# presumably it matches the CSV header; confirm.
non_feature_cols = ['rawTimesamp', 'connected', 'date', 'timestamp', 'id']

# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
data_down = data_down.drop(columns=non_feature_cols, errors='ignore')
data_up = data_up.drop(columns=non_feature_cols, errors='ignore')

In [None]:
# Nominal attributes of the upload data, followed by the physical-cell-ID
# columns collected in `pci_up`.
nom_attributes_up = [
    'measurement', 'hour', 'week', 'dayofweek', 'month', 'day_time',
    'rush_hour', 'location', 'campus', 'highway', 'urban', 'suburban',
    'Barop', 'Brünninghausen', 'Eichlinghofen', 'Groß-Barop',
    'Hombruch', 'Innenstadt Nord', 'KGV Ruhrwaldstraße', 'Kirchhörde',
    'Klinikviertel', 'Kruckel', 'Löttringhausen', 'Lücklemberg', 'Mitte',
    'Persebeck', 'Renninghausen', 'Salingen', 'Syburg', 'Wellinghofen',
    'Wichlinghofen',
] + pci_up

# Nominal attributes of the download data, followed by the physical-cell-ID
# columns collected in `pci_down`. Unlike the upload list, this one does not
# contain 'Innenstadt Nord'.
nom_attributes_down = [
    'measurement', 'hour', 'week', 'dayofweek', 'month', 'day_time',
    'rush_hour', 'location', 'campus', 'highway', 'urban', 'suburban',
    'Barop', 'Brünninghausen', 'Eichlinghofen', 'Groß-Barop',
    'Hombruch', 'KGV Ruhrwaldstraße', 'Kirchhörde', 'Klinikviertel',
    'Kruckel', 'Löttringhausen', 'Lücklemberg', 'Mitte', 'Persebeck',
    'Renninghausen', 'Salingen', 'Syburg', 'Wellinghofen',
    'Wichlinghofen',
] + pci_down

# Cardinal (numeric) attributes.
# Each name is listed exactly once: the earlier version repeated 'alt',
# 'speed' and 'ta', which made the correlation loops report those attributes
# twice and produced duplicated rows/columns in the correlation heatmaps.
# The order of first occurrence is kept.
cardinal_attributes_up = [
    'throughput', 'lat', 'lon', 'alt',
    'speed', 'acc', 'rsrp', 'rsrq',
    'sinr', 'cqi', 'ss', 'ta', 'payload', 'rtt',
    'tavg', 'tmin', 'tmax', 'prcp', 'wdir', 'wspd',
    'pres', 'tsun', 'throughput_mean', 'throughput_var',
    'throughput_std', 'txPower', 'distance', 'year',
]

# Same as the upload list except that 'txPower' is not included.
cardinal_attributes_down = [
    'throughput', 'lat', 'lon', 'alt',
    'speed', 'acc', 'rsrp', 'rsrq',
    'sinr', 'cqi', 'ss', 'ta', 'payload', 'rtt',
    'tavg', 'tmin', 'tmax', 'prcp', 'wdir', 'wspd',
    'pres', 'tsun', 'throughput_mean', 'throughput_var',
    'throughput_std', 'distance', 'year',
]

# Guard against duplicates creeping back in.
assert len(set(cardinal_attributes_up)) == len(cardinal_attributes_up)
assert len(set(cardinal_attributes_down)) == len(cardinal_attributes_down)

In [None]:
# Fit a random forest that predicts upload throughput from all remaining
# columns; its feature importances are ranked in the next cell.
# A fixed random_state makes the fitted forest — and therefore the importance
# ranking — reproducible across kernel restarts.
reg = RandomForestRegressor(random_state=42)

X_up = data_up.drop(columns='throughput')
y_up = data_up['throughput'].to_numpy()  # 1-D target array

reg.fit(X_up, y_up)

In [None]:
# Rank the upload features by the fitted forest's importance, largest first.
importance_col = 'feature importance'
df_feature_imp = pd.DataFrame(
    {importance_col: reg.feature_importances_},
    index=X_up.columns,
)
df_feature_imp = df_feature_imp.sort_values(by=importance_col, ascending=False)

# Let pandas show every row, formatted with ten decimal places.
pd.set_option('display.max_rows', len(df_feature_imp) + 1)
pd.set_option('display.float_format', '{:.10f}'.format)

print(df_feature_imp.shape[0])
df_feature_imp.head(1000)

In [None]:
# Fit a random forest that predicts download throughput from all remaining
# columns. Note that `reg` is rebound here, replacing the upload model.
# A fixed random_state makes the fitted forest — and therefore the importance
# ranking — reproducible across kernel restarts.
reg = RandomForestRegressor(random_state=42)

X_down = data_down.drop(columns='throughput')
y_down = data_down['throughput'].to_numpy()  # 1-D target array

reg.fit(X_down, y_down)

In [None]:
# NOTE(review): this cell repeats the download fit from the preceding cell
# and can probably be removed.
# The random_state is fixed so that the model whose importances are shown
# next is reproducible instead of a fresh random forest on every run.
reg = RandomForestRegressor(random_state=42)
reg.fit(X_down, y_down)

In [None]:
# Rank the download features by the fitted forest's importance, largest first.
importance_col = 'feature importance'
df_feature_imp = pd.DataFrame(
    {importance_col: reg.feature_importances_},
    index=X_down.columns,
)
df_feature_imp = df_feature_imp.sort_values(by=importance_col, ascending=False)

# Let pandas show every row, formatted with ten decimal places.
pd.set_option('display.max_rows', len(df_feature_imp) + 1)
pd.set_option('display.float_format', '{:.10f}'.format)

print(df_feature_imp.shape[0])
df_feature_imp.head(1000)


## Correlation Analysis

In [None]:
# Correlation screening of the cardinal download attributes against
# throughput. An attribute is reported (Pearson result, Spearman result and a
# scatter plot) only when its Pearson p-value is <= 0.05.
# dict.fromkeys() drops repeated names while keeping first-seen order, so an
# attribute that is listed twice is not reported twice.
for cardinal_a in dict.fromkeys(cardinal_attributes_down):
    if cardinal_a == 'throughput':
        continue  # the target is not compared with itself

    # Run the Pearson test once and reuse it for both the filter and the output.
    pearson_result = pearsonr(data_down[cardinal_a], data_down['throughput'])

    # A NaN p-value compares as False here, so such attributes are skipped.
    if pearson_result[1] <= 0.05:
        print(cardinal_a)
        print(pearson_result)
        print(spearmanr(data_down[cardinal_a], data_down['throughput']))
        plt.scatter(data_down[cardinal_a], data_down['throughput'])
        plt.ylabel('throughput')
        plt.xlabel(cardinal_a)
        plt.show()

In [None]:
# Correlation screening of the cardinal upload attributes against
# throughput. An attribute is reported (Pearson result, Spearman result and a
# scatter plot) only when its Pearson p-value is <= 0.05.
# dict.fromkeys() drops repeated names while keeping first-seen order, so an
# attribute that is listed twice is not reported twice.
for cardinal_a in dict.fromkeys(cardinal_attributes_up):
    if cardinal_a == 'throughput':
        continue  # the target is not compared with itself

    # Run the Pearson test once and reuse it for both the filter and the output.
    pearson_result = pearsonr(data_up[cardinal_a], data_up['throughput'])

    # A NaN p-value compares as False here, so such attributes are skipped.
    if pearson_result[1] <= 0.05:
        print(cardinal_a)
        print(pearson_result)
        print(spearmanr(data_up[cardinal_a], data_up['throughput']))
        plt.scatter(data_up[cardinal_a], data_up['throughput'])
        plt.ylabel('throughput')
        plt.xlabel(cardinal_a)
        plt.show()

In [None]:
# One-way ANOVA of throughput across districts.
# Each group is the throughput of the rows whose district column equals 1.

districts_down = [
    'Barop', 'Brünninghausen', 'Eichlinghofen', 'Groß-Barop', 'Hombruch',
    'KGV Ruhrwaldstraße', 'Kirchhörde', 'Klinikviertel', 'Kruckel',
    'Löttringhausen', 'Lücklemberg', 'Mitte', 'Persebeck', 'Renninghausen',
    'Salingen', 'Syburg', 'Wellinghofen', 'Wichlinghofen',
]

# NOTE(review): 'Wichlinghofen' and 'Innenstadt Nord' are listed in
# nom_attributes_up but are not part of the upload test — confirm that this
# omission is intentional.
districts_up = [
    'Barop', 'Brünninghausen', 'Eichlinghofen', 'Groß-Barop', 'Hombruch',
    'KGV Ruhrwaldstraße', 'Kirchhörde', 'Klinikviertel', 'Kruckel',
    'Löttringhausen', 'Lücklemberg', 'Mitte', 'Persebeck', 'Renninghausen',
    'Salingen', 'Syburg', 'Wellinghofen',
]

for frame, districts in ((data_down, districts_down), (data_up, districts_up)):
    samples = [frame.loc[frame[district] == 1, 'throughput'] for district in districts]
    print(f_oneway(*samples))

In [None]:
# One-way ANOVA of throughput across the day_time codes 1-4,
# first for the download data, then for the upload data.

day_time_codes = (1, 2, 3, 4)

for frame in (data_down, data_up):
    samples = [frame.loc[frame['day_time'] == code, 'throughput'] for code in day_time_codes]
    print(f_oneway(*samples))

In [None]:
# One-way ANOVA of throughput across the dayofweek codes 0-4,
# first for the download data, then for the upload data.

weekday_codes = range(5)

for frame in (data_down, data_up):
    samples = [frame.loc[frame['dayofweek'] == code, 'throughput'] for code in weekday_codes]
    print(f_oneway(*samples))

In [None]:
# One-way ANOVA of throughput across the listed dayofmonth values,
# first for the download data, then for the upload data.

days_of_month = (10, 11, 12, 13, 14, 17, 18, 19, 20, 15, 16)

for frame in (data_down, data_up):
    samples = [frame.loc[frame['dayofmonth'] == day, 'throughput'] for day in days_of_month]
    print(f_oneway(*samples))

In [None]:
# One-way ANOVA of throughput between month 12 and month 1,
# first for the download data, then for the upload data.

months = (12, 1)

for frame in (data_down, data_up):
    samples = [frame.loc[frame['month'] == month, 'throughput'] for month in months]
    print(f_oneway(*samples))

In [None]:
# One-way ANOVA of throughput across weeks 50, 51 and 3,
# first for the download data, then for the upload data.

weeks = (50, 51, 3)

for frame in (data_down, data_up):
    samples = [frame.loc[frame['week'] == week, 'throughput'] for week in weeks]
    print(f_oneway(*samples))

In [None]:
# One-way ANOVA of download throughput across physical cell IDs.
# Each group is the throughput of the rows whose PCI column equals 1.

pci_tested_down = [
    '496', '495', '488', '487', '486', '470', '449', '443', '432', '416',
    '415', '414', '377', '376', '374', '372', '370', '358', '354', '335',
    '331', '326', '324', '281', '274', '272', '271', '270', '249', '224',
    '223', '222', '219', '215', '213', '202', '199', '195', '167', '166',
    '165', '155', '128', '96', '95', '93', '86', '85', '68', '67',
    '66', '49', '48', '35', '33', '26', '25', '24',
]

samples_down = [data_down.loc[data_down[pci] == 1, 'throughput'] for pci in pci_tested_down]
print(f_oneway(*samples_down))

In [None]:
# One-way ANOVA of upload throughput across physical cell IDs.
# Each group is the throughput of the rows whose PCI column equals 1.
#
# Every PCI is listed exactly once. The earlier version passed '271' twice,
# which double-counted that cell's samples and distorted the F statistic and
# its degrees of freedom.
# NOTE(review): the extra '271' sat between '326' and '274', where the
# download test lists '324' and '281' — possibly another PCI was intended
# there; confirm against the upload data.

pci_tested_up = [
    '496', '495', '488', '487', '486', '470', '449', '443', '441', '434',
    '433', '432', '416', '415', '414', '377', '376', '375', '374', '372',
    '370', '358', '354', '335', '331', '326', '274', '272', '271', '270',
    '249', '224', '223', '222', '219', '215', '213', '202', '199', '195',
    '167', '166', '165', '155', '128', '96', '95', '94', '93', '86',
    '85', '68', '67', '66', '49', '48', '35', '33', '26', '25',
    '24',
]

# A repeated PCI would silently enter the test as two groups; fail loudly instead.
assert len(set(pci_tested_up)) == len(pci_tested_up), "duplicate PCI in ANOVA groups"

samples_up = [data_up.loc[data_up[pci] == 1, 'throughput'] for pci in pci_tested_up]
print(f_oneway(*samples_up))

In [None]:
# One-way ANOVA of throughput between rush_hour == 1 and rush_hour == 0,
# first for the download data, then for the upload data.

rush_hour_flags = (1, 0)

for frame in (data_down, data_up):
    samples = [frame.loc[frame['rush_hour'] == flag, 'throughput'] for flag in rush_hour_flags]
    print(f_oneway(*samples))

In [None]:
# One-way ANOVA of throughput across the listed hour values,
# first for the download data, then for the upload data.

hours = (6, 9, 10, 12, 13, 14, 15)

for frame in (data_down, data_up):
    samples = [frame.loc[frame['hour'] == hour, 'throughput'] for hour in hours]
    print(f_oneway(*samples))

In [None]:
# One-way ANOVA of throughput across the suburban / highway / campus / urban
# indicator columns (rows where the respective column equals 1),
# first for the download data, then for the upload data.
location_flags = ('suburban', 'highway', 'campus', 'urban')

for frame in (data_down, data_up):
    samples = [frame.loc[frame[flag] == 1, 'throughput'] for flag in location_flags]
    print(f_oneway(*samples))

# Heatmaps

In [None]:
"""
Upload data
Cardinal vs. cardinal attributes (Pearson correlation heatmap)
"""
fig, ax = plt.subplots(figsize=(25, 25))
cor = data_up[cardinal_attributes_up].corr(method='pearson')
# Colour scale centred on zero; each cell is annotated with its coefficient.
sns.heatmap(cor, annot=True, center=0, ax=ax)
fig.tight_layout()

In [None]:
"""
Download data
Cardinal vs. cardinal attributes (Pearson correlation heatmap)
"""
fig, ax = plt.subplots(figsize=(25, 25))
cor = data_down[cardinal_attributes_down].corr(method='pearson')
# Colour scale centred on zero; each cell is annotated with its coefficient.
sns.heatmap(cor, annot=True, center=0, ax=ax)
fig.tight_layout()