In [None]:
#pip install geopy

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from my_defs import* 

# Raise pandas' display limits so wide/long frames are shown without truncation
# (up to 500 columns and 500 rows).
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

In [None]:
# Load the raw download / upload measurement tables. Fields are separated by
# runs of whitespace, so the separator is a regular expression; it is written
# as a raw string because a plain '\s+' is an invalid escape sequence and
# raises a SyntaxWarning on recent Python versions.
data_down = pd.read_csv('Data_down.csv', sep=r'\s+')   # your path here
data_up = pd.read_csv('Data_up.csv', sep=r'\s+')    # your path here

# Datetime-related features

In [None]:
# Converting rawTimesamp from Unix time.
# Both conversion helpers live in my_defs and are applied element-wise.

# 'date' is derived from the still-raw Unix values
# (per the original notes the helper yields '%Y-%m-%d').
data_up['date'] = data_up['rawTimesamp'].map(unix_to_date)
data_down['date'] = data_down['rawTimesamp'].map(unix_to_date)

# Only afterwards is the raw column itself replaced by the full datetime
# (per the original notes: '%Y-%m-%d %H:%M:%S').
data_down['rawTimesamp'] = data_down['rawTimesamp'].map(unix_to_datetime)
data_up['rawTimesamp'] = data_up['rawTimesamp'].map(unix_to_datetime)

In [None]:
# Checking the column dtypes of both frames after the date transformations.
for heading, frame in (('Download Data Types\n', data_down),
                       ('Upload Data Types\n', data_up)):
    print(heading, frame.dtypes)

In [None]:
# Generating the cols hour, week, year, dayofweek, dayofmonth and month,
# and categorically encoding the time of day into 'day_time'.
#
# encode_day_time_cat (my_defs) calls featurize_datetime internally, so no
# separate featurization call is needed here.
data_up = encode_day_time_cat(data_up, 'hour', 'day_time')   # function in my_defs
data_down = encode_day_time_cat(data_down, 'hour', 'day_time')   # function in my_defs

# Preview the first rows only, rather than rendering the whole frame.
data_up.head()

In [None]:
# Generating the col rush_hour.
def _rush_hour_flag(hour):
    """Return 1 if `hour` lies in a rush-hour window, else 0.

    The windows are half-open: morning [6, 9) and evening [16, 19).
    The evening upper bound must be 19; comparing against 9 there can never
    be true for an hour that is already >= 16.
    """
    return 1 if (6 <= hour < 9) or (16 <= hour < 19) else 0


data_up['rush_hour'] = data_up['hour'].map(_rush_hour_flag)
data_down['rush_hour'] = data_down['hour'].map(_rush_hour_flag)



In [None]:
def print_unique_values(dataset, u_col = ""):
    """Print the number of distinct values in a column, followed by the values.

    Parameters
    ----------
    dataset : pandas.DataFrame
        The frame to inspect. The column is read from this argument, not from
        any module-level frame.
    u_col : str
        Name of the column in ``dataset`` to summarise.

    Raises
    ------
    KeyError
        If ``u_col`` is not a column of ``dataset`` (this includes the
        default empty string unless such a column exists).
    """
    # Compute the distinct values once and reuse them for the count and the listing.
    distinct = dataset[u_col].unique()
    print('unique values', u_col , '=', len(distinct), '\n',
          distinct, '\n')

In [None]:
# Report the distinct values of each calendar feature for the download frame.
for col in ("hour", "week", "dayofweek", "month", "dayofmonth"):
    print_unique_values(data_down, u_col=col)

In [None]:
# Report the distinct values of each calendar feature for the upload frame.
for col in ("hour", "week", "dayofweek", "month", "dayofmonth"):
    print_unique_values(data_up, u_col=col)

In [None]:
# Number of distinct 'measurement' groups per direction and year.
# Each entry is (caption to print, frame to inspect, year to filter on).
yearly_reports = (
    ('Upload Measurements count in 2018:', data_up, 2018),
    ('Upload Measurements count in 2019:', data_up, 2019),
    ('Download Measurements count in 2018:', data_down, 2018),
    ('Download Measurements count in 2019:', data_down, 2019),
)

for caption, frame, year in yearly_reports:
    rows_in_year = frame[frame['year'] == year]
    # One row per measurement group after the groupby/count, so the row count
    # of that result is the number of groups.
    print(caption, len(rows_in_year.groupby('measurement').count()))

# Add Weather Features

In [None]:
# Load the weather table. 'date' is parsed to datetime while reading; it is the
# column the measurement frames are later joined on.
# NOTE(review): the join only matches if the 'date' values produced by
# unix_to_date compare equal to these parsed datetimes — confirm the dtypes.
weather_data = pd.read_csv('DE-8GBU_18_19.csv', parse_dates=['date'])
weather_data.head()

In [None]:
# Distinct dates that occur in the upload measurements.
unique_dates = data_up['date'].unique()

In [None]:
# Summary statistics of the weather table, restricted to days on which upload
# measurements exist.
on_measurement_days = weather_data['date'].isin(unique_dates)
weather_data[on_measurement_days].describe()

In [None]:
# Remove the 'wpgt' and 'snow' columns from the weather table.
# NOTE(review): presumably these are too sparsely populated to be useful —
# confirm against the describe() output of the weather table.
weather_data = weather_data.drop(columns=['wpgt', 'snow'])

In [None]:
# Attach the weather columns to every measurement row via its 'date'.
# The join is an inner join, so rows whose date has no weather record are dropped.
data_up = data_up.merge(weather_data, how='inner', on='date')
data_down = data_down.merge(weather_data, how='inner', on='date')

# Location One-Hot Encoding

In [None]:
# Encode the 'location' column with onehot_enc (helper from my_defs); each
# frame is re-bound to whatever the helper returns.
# NOTE(review): presumably the helper adds one indicator column per distinct
# location — confirm in my_defs whether the original column is kept or dropped.
data_up = onehot_enc(data_up, 'location') # function in my_defs
data_down = onehot_enc(data_down, 'location')# function in my_defs

In [None]:
# Preview the first rows only, rather than rendering the whole frame.
data_down.head()

# Adding Rolling Features

In [None]:
# Working copies for the rolling statistics: only the timestamp, the
# measurement id and the throughput are kept, ordered by measurement and then
# by timestamp within each measurement.
rolling_down = data_down.loc[:, ['rawTimesamp', 'measurement', 'throughput']]
rolling_down = rolling_down.sort_values(['measurement', 'rawTimesamp'])

rolling_up = data_up.loc[:, ['rawTimesamp', 'measurement', 'throughput']]
rolling_up = rolling_up.sort_values(['measurement', 'rawTimesamp'])

In [None]:
# Compute throughput statistics with means_and_std (helper from my_defs),
# passing 'measurement' as the rolling column and 'throughput' as the target.
# NOTE(review): presumably this adds the throughput_mean / throughput_std /
# throughput_var columns that the next step shifts — confirm in my_defs.
rolling_down = means_and_std(rolling_down,rolling_col = 'measurement',
                            target_col = 'throughput')  # function in my_defs
rolling_up = means_and_std(rolling_up, rolling_col = 'measurement',
                           target_col = 'throughput')# function in my_defs

In [None]:
# Shift the three statistic columns within each 'measurement' group using
# shift_group_features_ (function in my_defs).
# NOTE(review): presumably the shift makes each row see only statistics of
# earlier rows of the same measurement — confirm direction and offset in my_defs.
rolling_down = shift_group_features_(rolling_down, group_by ='measurement',
                                  shift_col1 = "throughput_mean", shift_col2 = "throughput_std",
                                    shift_col3 = "throughput_var")
# Same shift for the upload frame (function in my_defs).
rolling_up = shift_group_features_(rolling_up, group_by ='measurement',
                               shift_col1 = "throughput_mean", shift_col2 = "throughput_std",
                                    shift_col3 = "throughput_var")

In [None]:
# Join the rolling statistics back onto the full frames. No `on=` is given, so
# pandas joins on every column name the two frames have in common; the join is
# an inner join.
data_down = data_down.merge(rolling_down, how='inner')
data_up = data_up.merge(rolling_up, how='inner')
data_down.head()

# Add address as One-Hot feature

In [None]:
'''
NOTE: This block of code may take quite some time to complete, depending on your internet speed
(check the wall time reported by the %%time cell below before running it).

Don't want to wait? :D

Just navigate to the last cell of this section and uncomment it to read and view our csv-files
containing the one-hot-encoded addresses.
'''

In [None]:
# Pair each row's latitude and longitude into a (lat, lon) tuple.
data_up['coordinates'] = list(zip(data_up['lat'], data_up['lon']))
data_down['coordinates'] = list(zip(data_down['lat'], data_down['lon']))

In [None]:
%%time
data_up['dist'] = data_up['coordinates'].map(
    lambda x:get_district(x)) # function in my_defs

data_down['dist'] = data_down['coordinates'].map(
    lambda x:get_district(x)) # function in my_defs 

data_up = data_up.drop(["coordinates"], axis =1 )
data_down = data_down.drop(["coordinates"], axis =1 )

In [None]:
# Preview the first rows only, rather than rendering the whole frame.
data_down.head()

In [None]:
# Encode the 'dist' column with onehot_enc (helper from my_defs); each frame is
# re-bound to whatever the helper returns.
# NOTE(review): presumably one indicator column per district — confirm in my_defs.
data_down = onehot_enc(data_down, "dist") # function in my_defs
data_up = onehot_enc(data_up, "dist") # function in my_defs

In [None]:
# Save both frames as *_fat.csv so that later runs can load them directly
# instead of repeating the slow district lookup. The index is not written.

data_down.to_csv("data_down_fat.csv", index= False)  # your path here
data_up.to_csv("data_up_fat.csv", index = False)  # your path here

In [None]:
'''
UNCOMMENT the lines below if you don't want to wait for the district lookup,
and check your file paths if you get an error.
'''

#data_down = pd.read_csv("data_down_fat.csv", parse_dates = ["rawTimesamp", "date"])
#data_up= pd.read_csv("data_up_fat.csv", parse_dates = ["rawTimesamp", "date"])
#display(data_down.head())
#display(data_up.head())

# Physical Cell ID Encoding

In [None]:
# Encode the 'pci' column with onehot_enc (helper from my_defs); each frame is
# re-bound to whatever the helper returns.
# NOTE(review): presumably one indicator column per distinct cell id — confirm in my_defs.
data_down = onehot_enc(data_down, 'pci')
data_up = onehot_enc(data_up, 'pci') 


In [None]:
# data_up_full.csv & data_down_full.csv contain all the generated features.
# Any undesired features will be dropped right before modeling.


data_up.to_csv("data_up_full.csv", index = False)  #your path here

data_down.to_csv("data_down_full.csv", index= False)  #your path here
