# Data Exploration
In this notebook I explore the given data to get a better sense of how it behaves. The steps are as follows:

* Loading data
* Calculating the number of features `n` and the number of instances `m`
* Knowing types of features
* Cleaning data
* Handling missing values
* Counting the number of unique values for each feature

In [120]:
import pandas as pd
# Load the raw dataset; the relative path is resolved against the kernel's working directory.
data_path = "data.csv"
train = pd.read_csv(data_path)

In [121]:
# Preview the first rows (head() shows 5 by default) to sanity-check the load.
train.head()

Unnamed: 0.1,Unnamed: 0,Episode,Station,Channel Type,Season,Year,Date,Day of week,Start_time,End_time,Length,Name of show,Name of episode,Genre,First time or rerun,# of episode in the season,Movie?,Game of the Canadiens during episode?,Market Share_total,Temperature in Montreal during episode
0,1,Vidéoclips V,V Total,General Channel,Fall,2016,2016-08-29,Monday,2016-08-29 06:00:00,2016-08-29 08:00:00,8,Vidéoclips V,,Music Video Clips,No,Yes,No,No,0.9,20.4
1,2,Apollo dans l'frigo,V Total,General Channel,Fall,2016,2016-08-29,Monday,2016-08-29 08:00:00,2016-08-29 08:30:00,2,Apollo dans l'frigo,,Informal Education and Recreation and Leisure,No,Yes,No,No,0.5,19.125
2,3,Infopublicité,V Total,General Channel,Fall,2016,2016-08-29,Monday,2016-08-29 08:30:00,2016-08-29 09:00:00,2,Infopublicité,,"Infomercials, Promotional and Corporate Videos",No,Yes,No,No,0.3,19.125
3,4,"Infos, Les",V Total,General Channel,Fall,2016,2016-08-29,Monday,2016-08-29 09:00:00,2016-08-29 10:00:00,4,"Infos, Les",,News,No,Yes,No,No,1.7,18.125
4,5,"Souper presque parfait, Un",V Total,General Channel,Fall,2016,2016-08-29,Monday,2016-08-29 10:00:00,2016-08-29 10:30:00,2,"Souper presque parfait, Un",,Reality Programs,No,Yes,No,No,2.2,18.25


In [122]:
# Column labels of the loaded frame, i.e. the available features.
train.columns

Index(['Unnamed: 0', 'Episode', 'Station', 'Channel Type', 'Season', 'Year',
       'Date', 'Day of week', 'Start_time', 'End_time', 'Length',
       'Name of show', 'Name of episode', 'Genre', 'First time or rerun',
       '# of episode in the season', 'Movie?',
       'Game of the Canadiens during episode?', 'Market Share_total',
       'Temperature in Montreal during episode'],
      dtype='object')

In [123]:
# shape is (rows, columns): number of instances m first, number of features n second.
train.shape

(616656, 20)

In [124]:
# Summarise dtype and value range of the numeric features.
# Keys are the row labels shown in the table; values are the column names in `train`.
numeric_features = {
    'Market Share': 'Market Share_total',
    'Length': 'Length',
    'Temperature': 'Temperature in Montreal during episode',
    'Year': 'Year',
}
# min/max are passed through int() for a compact table, so any fractional part is dropped.
data = [
    [train[column].dtype, int(train[column].min()), int(train[column].max())]
    for column in numeric_features.values()
]
pd.DataFrame(data, columns=["type", "min", "max"], index=list(numeric_features))

Unnamed: 0,type,min,max
Market Share,float64,0,89
Length,int64,0,92
Temperature,float64,-27,34
Year,int64,2016,2019


In [125]:
# Check whether "Episode" and "Name of show" hold exactly the same values.
episodes = train['Episode']
show_names = train['Name of show']
episodes.equals(show_names)

True

In [126]:
# Drop unnecessary features: 'Unnamed: 0' and 'Name of show' (the equality check
# against 'Episode' in this notebook returned True, so the latter is redundant).
# errors="ignore" keeps the cell re-runnable: on a second execution the columns are
# already gone and a plain drop would raise KeyError.
train = train.drop(columns=['Unnamed: 0', 'Name of show'], errors='ignore')
train.head(1)

Unnamed: 0,Episode,Station,Channel Type,Season,Year,Date,Day of week,Start_time,End_time,Length,Name of episode,Genre,First time or rerun,# of episode in the season,Movie?,Game of the Canadiens during episode?,Market Share_total,Temperature in Montreal during episode
0,Vidéoclips V,V Total,General Channel,Fall,2016,2016-08-29,Monday,2016-08-29 06:00:00,2016-08-29 08:00:00,8,,Music Video Clips,No,Yes,No,No,0.9,20.4


In [127]:
# Count NaNs per column *before* replacing them, so both states can be compared.
nan_before = train.isna().sum()
# Replace NaN values with "No value".
# NOTE(review): the same string is written into every column, so a numeric column
# that contains NaN would end up holding strings; confirm this is intended.
train = train.fillna("No value")
nan_after = train.isna().sum()
# Show number of NaNs before and after replacement for each feature.
# Both counts are Series labelled by column name, so they are combined by label
# instead of being looked up by integer position.
pd.DataFrame({"#before": nan_before, "#after": nan_after})

Unnamed: 0,#before,#after
Episode,0,0
Station,0,0
Channel Type,0,0
Season,0,0
Year,0,0
Date,0,0
Day of week,0,0
Start_time,43,0
End_time,43,0
Length,0,0


In [128]:
# For every feature, report its dtype and how many distinct values it takes.
names = train.columns
data = [
    [train[column].dtype, len(train[column].unique())]
    for column in names
]
pd.DataFrame(data, columns=["type", "#unique"], index=names)

Unnamed: 0,type,#unique
Episode,object,6687
Station,object,24
Channel Type,object,2
Season,object,4
Year,int64,4
Date,object,877
Day of week,object,7
Start_time,object,138322
End_time,object,138334
Length,int64,39
