## Importing the Packages

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

## Importing the Data 

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
train

Unnamed: 0,Id,Province/State,Country/Region,Lat,Long,Date,ConfirmedCases,Fatalities
0,1,,Afghanistan,33.0000,65.0000,2020-01-22,0.0,0.0
1,2,,Afghanistan,33.0000,65.0000,2020-01-23,0.0,0.0
2,3,,Afghanistan,33.0000,65.0000,2020-01-24,0.0,0.0
3,4,,Afghanistan,33.0000,65.0000,2020-01-25,0.0,0.0
4,5,,Afghanistan,33.0000,65.0000,2020-01-26,0.0,0.0
...,...,...,...,...,...,...,...,...
17887,26378,,Zambia,-15.4167,28.2833,2020-03-20,2.0,0.0
17888,26379,,Zambia,-15.4167,28.2833,2020-03-21,2.0,0.0
17889,26380,,Zambia,-15.4167,28.2833,2020-03-22,3.0,0.0
17890,26381,,Zambia,-15.4167,28.2833,2020-03-23,3.0,0.0


In [4]:
test

Unnamed: 0,ForecastId,Province/State,Country/Region,Lat,Long,Date
0,1,,Afghanistan,33.0000,65.0000,2020-03-12
1,2,,Afghanistan,33.0000,65.0000,2020-03-13
2,3,,Afghanistan,33.0000,65.0000,2020-03-14
3,4,,Afghanistan,33.0000,65.0000,2020-03-15
4,5,,Afghanistan,33.0000,65.0000,2020-03-16
...,...,...,...,...,...,...
12207,12208,,Zambia,-15.4167,28.2833,2020-04-19
12208,12209,,Zambia,-15.4167,28.2833,2020-04-20
12209,12210,,Zambia,-15.4167,28.2833,2020-04-21
12210,12211,,Zambia,-15.4167,28.2833,2020-04-22


## Data Preprocessing

In [5]:
#Setting the Index

In [6]:
train.set_index('Id' , inplace=True)
test.set_index('ForecastId' , inplace=True)

In [7]:
train

Unnamed: 0_level_0,Province/State,Country/Region,Lat,Long,Date,ConfirmedCases,Fatalities
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,,Afghanistan,33.0000,65.0000,2020-01-22,0.0,0.0
2,,Afghanistan,33.0000,65.0000,2020-01-23,0.0,0.0
3,,Afghanistan,33.0000,65.0000,2020-01-24,0.0,0.0
4,,Afghanistan,33.0000,65.0000,2020-01-25,0.0,0.0
5,,Afghanistan,33.0000,65.0000,2020-01-26,0.0,0.0
...,...,...,...,...,...,...,...
26378,,Zambia,-15.4167,28.2833,2020-03-20,2.0,0.0
26379,,Zambia,-15.4167,28.2833,2020-03-21,2.0,0.0
26380,,Zambia,-15.4167,28.2833,2020-03-22,3.0,0.0
26381,,Zambia,-15.4167,28.2833,2020-03-23,3.0,0.0


In [8]:
test

Unnamed: 0_level_0,Province/State,Country/Region,Lat,Long,Date
ForecastId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,,Afghanistan,33.0000,65.0000,2020-03-12
2,,Afghanistan,33.0000,65.0000,2020-03-13
3,,Afghanistan,33.0000,65.0000,2020-03-14
4,,Afghanistan,33.0000,65.0000,2020-03-15
5,,Afghanistan,33.0000,65.0000,2020-03-16
...,...,...,...,...,...
12208,,Zambia,-15.4167,28.2833,2020-04-19
12209,,Zambia,-15.4167,28.2833,2020-04-20
12210,,Zambia,-15.4167,28.2833,2020-04-21
12211,,Zambia,-15.4167,28.2833,2020-04-22


In [9]:
#Dealing with Null Values

In [10]:
def missing_percentage(df):
    total = df.isnull().sum().sort_values(ascending = False)[df.isnull().sum().sort_values(ascending = False)!= 0]
    percent = round(df.isnull().sum().sort_values(ascending = False)/len(df)*100 , 2)[round(df.isnull().sum().sort_values(ascending = False)/len(df)*100 , 2)!= 0]
    data_type = df.dtypes.sort_values(ascending = False)[df.isnull().sum().sort_values(ascending = False)!= 0]
    return pd.concat([total , percent , data_type],axis = 1 , keys = ['Total' , 'Percent' , 'Type'])

In [11]:
missing_percentage(train)

Unnamed: 0,Total,Percent,Type
Province/State,9702,54.23,object


In [12]:
train['Province/State'].value_counts()

Diamond Princess                126
Grand Princess                  126
Australian Capital Territory     63
Maine                            63
Iowa                             63
                               ... 
Jilin                            63
Jiangxi                          63
Jiangsu                          63
Inner Mongolia                   63
United Kingdom                   63
Name: Province/State, Length: 128, dtype: int64

In [13]:
train = train.drop(['Province/State'] , axis=1)

In [14]:
missing_percentage(test)

Unnamed: 0,Total,Percent,Type
Province/State,6622,54.23,object


In [15]:
test = test.drop(['Province/State'] , axis=1)

In [16]:
train

Unnamed: 0_level_0,Country/Region,Lat,Long,Date,ConfirmedCases,Fatalities
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Afghanistan,33.0000,65.0000,2020-01-22,0.0,0.0
2,Afghanistan,33.0000,65.0000,2020-01-23,0.0,0.0
3,Afghanistan,33.0000,65.0000,2020-01-24,0.0,0.0
4,Afghanistan,33.0000,65.0000,2020-01-25,0.0,0.0
5,Afghanistan,33.0000,65.0000,2020-01-26,0.0,0.0
...,...,...,...,...,...,...
26378,Zambia,-15.4167,28.2833,2020-03-20,2.0,0.0
26379,Zambia,-15.4167,28.2833,2020-03-21,2.0,0.0
26380,Zambia,-15.4167,28.2833,2020-03-22,3.0,0.0
26381,Zambia,-15.4167,28.2833,2020-03-23,3.0,0.0


In [17]:
test

Unnamed: 0_level_0,Country/Region,Lat,Long,Date
ForecastId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Afghanistan,33.0000,65.0000,2020-03-12
2,Afghanistan,33.0000,65.0000,2020-03-13
3,Afghanistan,33.0000,65.0000,2020-03-14
4,Afghanistan,33.0000,65.0000,2020-03-15
5,Afghanistan,33.0000,65.0000,2020-03-16
...,...,...,...,...
12208,Zambia,-15.4167,28.2833,2020-04-19
12209,Zambia,-15.4167,28.2833,2020-04-20
12210,Zambia,-15.4167,28.2833,2020-04-21
12211,Zambia,-15.4167,28.2833,2020-04-22


In [18]:
train['ConfirmedCases'].value_counts()

0.0       11022
1.0         874
2.0         347
3.0         309
4.0         203
          ...  
1048.0        1
1075.0        1
1092.0        1
1117.0        1
8077.0        1
Name: ConfirmedCases, Length: 1023, dtype: int64

In [19]:
train['Fatalities'].value_counts()

0.0       15424
1.0         899
2.0         365
3.0         244
6.0         190
          ...  
3160.0        1
243.0         1
450.0         1
562.0         1
422.0         1
Name: Fatalities, Length: 204, dtype: int64

## Splitting the Data (Input , Output)

In [20]:
x = train.iloc[:,:-2]
y1 = train.iloc[:,-2:-1]
y2 = train.iloc[:,-1:]

In [21]:
x

Unnamed: 0_level_0,Country/Region,Lat,Long,Date
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Afghanistan,33.0000,65.0000,2020-01-22
2,Afghanistan,33.0000,65.0000,2020-01-23
3,Afghanistan,33.0000,65.0000,2020-01-24
4,Afghanistan,33.0000,65.0000,2020-01-25
5,Afghanistan,33.0000,65.0000,2020-01-26
...,...,...,...,...
26378,Zambia,-15.4167,28.2833,2020-03-20
26379,Zambia,-15.4167,28.2833,2020-03-21
26380,Zambia,-15.4167,28.2833,2020-03-22
26381,Zambia,-15.4167,28.2833,2020-03-23


In [22]:
y1

Unnamed: 0_level_0,ConfirmedCases
Id,Unnamed: 1_level_1
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0
...,...
26378,2.0
26379,2.0
26380,3.0
26381,3.0


In [23]:
y2

Unnamed: 0_level_0,Fatalities
Id,Unnamed: 1_level_1
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0
...,...
26378,0.0
26379,0.0
26380,0.0
26381,0.0


## Data Encoding

In [24]:
#train

In [25]:
x_obj = x.select_dtypes(include=["object"])
x_non_obj = x.select_dtypes(exclude=["object"])

In [26]:
x_obj

Unnamed: 0_level_0,Country/Region,Date
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Afghanistan,2020-01-22
2,Afghanistan,2020-01-23
3,Afghanistan,2020-01-24
4,Afghanistan,2020-01-25
5,Afghanistan,2020-01-26
...,...,...
26378,Zambia,2020-03-20
26379,Zambia,2020-03-21
26380,Zambia,2020-03-22
26381,Zambia,2020-03-23


In [27]:
x_non_obj

Unnamed: 0_level_0,Lat,Long
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,33.0000,65.0000
2,33.0000,65.0000
3,33.0000,65.0000
4,33.0000,65.0000
5,33.0000,65.0000
...,...,...
26378,-15.4167,28.2833
26379,-15.4167,28.2833
26380,-15.4167,28.2833
26381,-15.4167,28.2833


In [28]:
la = LabelEncoder()

In [29]:
for i in range(x_obj.shape[1]):
    x_obj.iloc[:,i]=la.fit_transform(x_obj.iloc[:,i])

In [30]:
X = pd.concat([x_non_obj,x_obj] , axis = 1)

In [31]:
X

Unnamed: 0_level_0,Lat,Long,Country/Region,Date
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,33.0000,65.0000,0,0
2,33.0000,65.0000,0,1
3,33.0000,65.0000,0,2
4,33.0000,65.0000,0,3
5,33.0000,65.0000,0,4
...,...,...,...,...
26378,-15.4167,28.2833,162,58
26379,-15.4167,28.2833,162,59
26380,-15.4167,28.2833,162,60
26381,-15.4167,28.2833,162,61


In [32]:
#test

In [33]:
test_obj = test.select_dtypes(include=["object"])
test_non_obj = test.select_dtypes(exclude=["object"])

In [34]:
test_obj

Unnamed: 0_level_0,Country/Region,Date
ForecastId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Afghanistan,2020-03-12
2,Afghanistan,2020-03-13
3,Afghanistan,2020-03-14
4,Afghanistan,2020-03-15
5,Afghanistan,2020-03-16
...,...,...
12208,Zambia,2020-04-19
12209,Zambia,2020-04-20
12210,Zambia,2020-04-21
12211,Zambia,2020-04-22


In [35]:
test_non_obj

Unnamed: 0_level_0,Lat,Long
ForecastId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,33.0000,65.0000
2,33.0000,65.0000
3,33.0000,65.0000
4,33.0000,65.0000
5,33.0000,65.0000
...,...,...
12208,-15.4167,28.2833
12209,-15.4167,28.2833
12210,-15.4167,28.2833
12211,-15.4167,28.2833


In [36]:
for i in range(test_obj.shape[1]):
    test_obj.iloc[:,i]=la.fit_transform(test_obj.iloc[:,i])

In [37]:
Test = pd.concat([test_non_obj,test_obj] , axis = 1)

In [38]:
Test

Unnamed: 0_level_0,Lat,Long,Country/Region,Date
ForecastId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,33.0000,65.0000,0,0
2,33.0000,65.0000,0,1
3,33.0000,65.0000,0,2
4,33.0000,65.0000,0,3
5,33.0000,65.0000,0,4
...,...,...,...,...
12208,-15.4167,28.2833,162,38
12209,-15.4167,28.2833,162,39
12210,-15.4167,28.2833,162,40
12211,-15.4167,28.2833,162,41


## Data Normalization

In [39]:
#train

In [40]:
train_scaler = StandardScaler().fit(X)

In [41]:
X_scaled = train_scaler.transform(X)

In [42]:
X_scaled

array([[ 0.29267353,  0.7536666 , -1.57518545, -1.70477271],
       [ 0.29267353,  0.7536666 , -1.57518545, -1.64978004],
       [ 0.29267353,  0.7536666 , -1.57518545, -1.59478738],
       ...,
       [-1.81841693,  0.29425434,  1.38705171,  1.59478738],
       [-1.81841693,  0.29425434,  1.38705171,  1.64978004],
       [-1.81841693,  0.29425434,  1.38705171,  1.70477271]])

In [43]:
X_scaled.mean(axis=0)      #Equal almost "0"

array([-1.23904166e-16, -5.08324783e-17, -1.52497435e-16, -8.93539658e-19])

In [44]:
X_scaled.std(axis = 0)     #Equal "1"

array([1., 1., 1., 1.])

In [45]:
#test

In [46]:
test_scaler = StandardScaler().fit(Test)

In [47]:
Test_scaled = test_scaler.transform(Test)

In [48]:
Test_scaled

array([[ 0.29267353,  0.7536666 , -1.57518545, -1.69222822],
       [ 0.29267353,  0.7536666 , -1.57518545, -1.61164593],
       [ 0.29267353,  0.7536666 , -1.57518545, -1.53106363],
       ...,
       [-1.81841693,  0.29425434,  1.38705171,  1.53106363],
       [-1.81841693,  0.29425434,  1.38705171,  1.61164593],
       [-1.81841693,  0.29425434,  1.38705171,  1.69222822]])

## Modeling (DecisionTreeRegressor)

In [49]:
model = DecisionTreeRegressor()

## Fitting the Model and Making Predictions

In [50]:
model.fit(X_scaled , y1)

DecisionTreeRegressor()

In [51]:
model.score(X_scaled , y1)

1.0

In [52]:
predict_1 = model.predict(Test_scaled)

In [53]:
predict_1

array([0., 0., 0., ..., 2., 3., 3.])

In [54]:
model.fit (X_scaled , y2)

DecisionTreeRegressor()

In [55]:
model.score(X_scaled , y2)

1.0

In [56]:
predict_2 = model.predict(Test_scaled)

In [57]:
predict_2

array([0., 0., 0., ..., 0., 0., 0.])

## Saving Predicted Values

In [58]:
Submission = Test

In [59]:
Submission

Unnamed: 0_level_0,Lat,Long,Country/Region,Date
ForecastId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,33.0000,65.0000,0,0
2,33.0000,65.0000,0,1
3,33.0000,65.0000,0,2
4,33.0000,65.0000,0,3
5,33.0000,65.0000,0,4
...,...,...,...,...
12208,-15.4167,28.2833,162,38
12209,-15.4167,28.2833,162,39
12210,-15.4167,28.2833,162,40
12211,-15.4167,28.2833,162,41


In [60]:
Submission['ConfirmedCases'] = predict_1

In [61]:
Submission['Fatalities'] = predict_2

In [62]:
Submission

Unnamed: 0_level_0,Lat,Long,Country/Region,Date,ConfirmedCases,Fatalities
ForecastId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,33.0000,65.0000,0,0,0.0,0.0
2,33.0000,65.0000,0,1,0.0,0.0
3,33.0000,65.0000,0,2,0.0,0.0
4,33.0000,65.0000,0,3,0.0,0.0
5,33.0000,65.0000,0,4,0.0,0.0
...,...,...,...,...,...,...
12208,-15.4167,28.2833,162,38,2.0,0.0
12209,-15.4167,28.2833,162,39,2.0,0.0
12210,-15.4167,28.2833,162,40,2.0,0.0
12211,-15.4167,28.2833,162,41,3.0,0.0


In [63]:
Submission = Submission.drop(['Lat' , 'Long' , 'Country/Region' , 'Date'] , axis = 1)

In [64]:
Submission

Unnamed: 0_level_0,ConfirmedCases,Fatalities
ForecastId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0
5,0.0,0.0
...,...,...
12208,2.0,0.0
12209,2.0,0.0
12210,2.0,0.0
12211,3.0,0.0


In [65]:
Submission = Submission.to_csv("Submission.csv")