#Preparasi Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=pd.read_csv('/content/covid_worldwide.csv',usecols=['Total Cases','Total Deaths','Total Recovered','Active Cases','Total Test','Population'])
df.head()

Unnamed: 0,Total Cases,Total Deaths,Total Recovered,Active Cases,Total Test,Population
0,104196861,1132935,101322779,1741147,1159832679,334805269
1,44682784,530740,44150289,1755,915265788,1406631776
2,39524311,164233,39264546,95532,271490188,65584518
3,37779833,165711,37398100,216022,122332384,83883596
4,36824580,697074,35919372,208134,63776166,215353593


In [3]:
df.dtypes

Total Cases        object
Total Deaths       object
Total Recovered    object
Active Cases       object
Total Test         object
Population         object
dtype: object

In [4]:
def clean_currency(x):
    """ If the value is a string, then remove currency symbol and delimiters
    otherwise, the value is numeric and can be converted
    """
    if isinstance(x, str):
        return(x.replace(',', ''))
    return(x)

In [5]:
df['Total Cases']=df['Total Cases'].apply(clean_currency).astype('float')
df['Total Deaths']=df['Total Deaths'].apply(clean_currency).astype('float')
df['Total Recovered']=df['Total Recovered'].apply(clean_currency).astype('float')
df['Active Cases']=df['Active Cases'].apply(clean_currency).astype('float')
df['Total Test']=df['Total Test'].apply(clean_currency).astype('float')
df['Population']=df['Population'].apply(clean_currency).astype('float')

In [6]:
df.dtypes

Total Cases        float64
Total Deaths       float64
Total Recovered    float64
Active Cases       float64
Total Test         float64
Population         float64
dtype: object

In [7]:
df.describe()

Unnamed: 0,Total Cases,Total Deaths,Total Recovered,Active Cases,Total Test,Population
count,231.0,225.0,210.0,212.0,213.0,228.0
mean,2923460.0,30057.78,2993905.0,90998.85,32493160.0,28493260.0
std,9479286.0,105380.6,9520209.0,766388.8,117737000.0,102280300.0
min,5.0,1.0,2.0,0.0,7850.0,799.0
25%,24001.0,223.0,21972.5,52.5,347815.0,445081.5
50%,206592.0,2179.0,235145.5,1115.0,2216193.0,5676382.0
75%,1296146.0,14452.0,1465768.0,10228.75,12984260.0,21707590.0
max,104196900.0,1132935.0,101322800.0,10952620.0,1159833000.0,1406632000.0


In [8]:
df.isnull().sum()

Total Cases         0
Total Deaths        6
Total Recovered    21
Active Cases       19
Total Test         18
Population          3
dtype: int64

In [9]:
df = df.reset_index()
df_onecolumn = pd.melt(df, id_vars=['index'])
df_sampled = df_onecolumn.sample(frac=0.50).reset_index(drop=True)
df_fraction = df_sampled.pivot(index='index', columns='variable', values='value')

df_fraction

variable,Active Cases,Population,Total Cases,Total Deaths,Total Recovered,Total Test
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1741147.0,,,1132935.0,,1.159833e+09
1,1755.0,,,,,9.152658e+08
2,95532.0,65584518.0,,,39264546.0,2.714902e+08
3,216022.0,,,165711.0,37398100.0,
4,,,36824580.0,,35919372.0,6.377617e+07
...,...,...,...,...,...,...
226,,,,,699.0,
227,,799.0,,,29.0,
228,0.0,626161.0,,1.0,9.0,
229,0.0,,9.0,,,


In [10]:
df_fraction.isnull().sum()

variable
Active Cases       121
Population         109
Total Cases        111
Total Deaths       116
Total Recovered    109
Total Test         126
dtype: int64

#Imputasi dengan mean

In [11]:
df1=df_fraction.copy()
df1.isnull().sum()

variable
Active Cases       121
Population         109
Total Cases        111
Total Deaths       116
Total Recovered    109
Total Test         126
dtype: int64

In [12]:
df1['Total Cases']=df1['Total Cases'].fillna(df1['Total Cases'].mean())
df1['Total Deaths']=df1['Total Deaths'].fillna(df1['Total Deaths'].mean())
df1['Total Recovered']=df1['Total Recovered'].fillna(df1['Total Recovered'].mean())
df1['Active Cases']=df1['Active Cases'].fillna(df1['Active Cases'].mean())
df1['Total Test']=df1['Total Test'].fillna(df1['Total Test'].mean())
df1['Population']=df1['Population'].fillna(df1['Population'].mean())

In [13]:
df1.isnull().sum()

variable
Active Cases       0
Population         0
Total Cases        0
Total Deaths       0
Total Recovered    0
Total Test         0
dtype: int64

In [14]:
df1.describe()

variable,Active Cases,Population,Total Cases,Total Deaths,Total Recovered,Total Test
count,226.0,226.0,226.0,226.0,226.0,226.0
mean,146861.2,18408870.0,2472580.0,32743.87,2709295.0,39488710.0
std,736043.8,25185910.0,4592688.0,82846.08,5081252.0,101002700.0
min,0.0,799.0,5.0,1.0,9.0,19690.0
25%,1150.75,5017558.0,192579.0,2261.75,167248.0,2729571.0
50%,146861.2,18408870.0,2472580.0,32743.87,2709295.0,39488710.0
75%,146861.2,18408870.0,2472580.0,32743.87,2709295.0,39488710.0
max,10952620.0,279134500.0,36824580.0,1132935.0,39264550.0,1159833000.0


#imputasi dengan median

In [15]:
df2=df_fraction.copy()
df2.isnull().sum()

variable
Active Cases       121
Population         109
Total Cases        111
Total Deaths       116
Total Recovered    109
Total Test         126
dtype: int64

In [16]:
df2['Total Cases']=df2['Total Cases'].fillna(df2['Total Cases'].median())
df2['Total Deaths']=df2['Total Deaths'].fillna(df2['Total Deaths'].median())
df2['Total Recovered']=df2['Total Recovered'].fillna(df2['Total Recovered'].median())
df2['Active Cases']=df2['Active Cases'].fillna(df2['Active Cases'].median())
df2['Total Test']=df2['Total Test'].fillna(df2['Total Test'].median())
df2['Population']=df2['Population'].fillna(df2['Population'].median())

In [17]:
df2.isnull().sum()

variable
Active Cases       0
Population         0
Total Cases        0
Total Deaths       0
Total Recovered    0
Total Test         0
dtype: int64

In [18]:
df2.describe()

variable,Active Cases,Population,Total Cases,Total Deaths,Total Recovered,Total Test
count,226.0,226.0,226.0,226.0,226.0,226.0
mean,68751.84,12029710.0,1359639.0,17000.79,1490740.0,18655030.0
std,739647.4,26042360.0,4730937.0,84258.79,5236417.0,102701300.0
min,0.0,799.0,5.0,1.0,9.0,19690.0
25%,971.0,5017558.0,192579.0,2072.0,167248.0,2120364.0
50%,971.0,5182354.0,206592.0,2072.0,182749.0,2120364.0
75%,971.0,5335435.0,223913.2,2072.0,193120.5,2120364.0
max,10952620.0,279134500.0,36824580.0,1132935.0,39264550.0,1159833000.0


#imputasi dengan modus

In [19]:
import statistics

In [20]:
df3=df_fraction.copy()
df3.isnull().sum()

variable
Active Cases       121
Population         109
Total Cases        111
Total Deaths       116
Total Recovered    109
Total Test         126
dtype: int64

In [21]:
df3['Total Cases']=df3['Total Cases'].fillna(statistics.mode(df3['Total Cases']))
df3['Total Deaths']=df3['Total Deaths'].fillna(statistics.mode(df3['Total Deaths']))
df3['Total Recovered']=df3['Total Recovered'].fillna(statistics.mode(df3['Total Recovered']))
df3['Active Cases']=df3['Active Cases'].fillna(statistics.mode(df3['Active Cases']))
df3['Total Test']=df3['Total Test'].fillna(statistics.mode(df3['Total Test']))
df3['Population']=df3['Population'].fillna(statistics.mode(df3['Population']))

In [22]:
df3.isnull().sum()

variable
Active Cases         0
Population         109
Total Cases        111
Total Deaths         0
Total Recovered    109
Total Test           0
dtype: int64

In [23]:
df3.describe()

variable,Active Cases,Population,Total Cases,Total Deaths,Total Recovered,Total Test
count,226.0,117.0,115.0,226.0,117.0,226.0
mean,68231.96,18408870.0,2472580.0,15975.27,2709295.0,664105300.0
std,739695.4,35076800.0,6452170.0,84447.03,7076736.0,566760300.0
min,0.0,799.0,5.0,1.0,9.0,19690.0
25%,0.0,626161.0,27589.0,74.0,21596.0,2729571.0
50%,0.0,5182354.0,206592.0,74.0,182749.0,1159833000.0
75%,528.5,19205040.0,1250440.0,1878.75,1282528.0,1159833000.0
max,10952620.0,279134500.0,36824580.0,1132935.0,39264550.0,1159833000.0


#Imputasi dengan regresi linier

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [25]:
df4=df_fraction.copy()
df4.isnull().sum()

variable
Active Cases       121
Population         109
Total Cases        111
Total Deaths       116
Total Recovered    109
Total Test         126
dtype: int64

In [26]:
missing_columns=['Total Cases','Total Deaths','Total Recovered','Active Cases','Total Test','Population']

In [27]:
def mean_imputation(df, feature):
    df.loc[df[feature].isnull(), feature + '_imp'] = df[feature].mean()
    
    return df

In [28]:
for feature in missing_columns:
    df4[feature + '_imp'] = df4[feature]
    df4 = mean_imputation(df4, feature)

In [29]:
df4.tail()

variable,Active Cases,Population,Total Cases,Total Deaths,Total Recovered,Total Test,Total Cases_imp,Total Deaths_imp,Total Recovered_imp,Active Cases_imp,Total Test_imp,Population_imp
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
226,,,,,699.0,,2472580.0,32743.872727,699.0,146861.180952,39488707.84,18408870.0
227,,799.0,,,29.0,,2472580.0,32743.872727,29.0,146861.180952,39488707.84,799.0
228,0.0,626161.0,,1.0,9.0,,2472580.0,1.0,9.0,0.0,39488707.84,626161.0
229,0.0,,9.0,,,,9.0,32743.872727,2709295.0,0.0,39488707.84,18408870.0
230,5.0,1378.0,5.0,,,,5.0,32743.872727,2709295.0,5.0,39488707.84,1378.0


In [30]:
deter_data = pd.DataFrame(columns = ["Det" + name for name in missing_columns])
for feature in missing_columns:
        
    deter_data["Det" + feature] = df4[feature + "_imp"]
    parameters = list(set(df4.columns) - set(missing_columns) - {feature + '_imp'})
    model = LinearRegression()
    model.fit(X = df4[parameters], y = df4[feature + '_imp'])
    deter_data.loc[df4[feature].isnull(), "Det" + feature] = model.predict(df4[parameters])[df4[feature].isnull()]

In [31]:
deter_data.tail()

Unnamed: 0_level_0,DetTotal Cases,DetTotal Deaths,DetTotal Recovered,DetActive Cases,DetTotal Test,DetPopulation
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
226,1269294.0,32582.801908,699.0,155342.64037,34733700.0,17112750.0
227,801838.5,24162.501392,29.0,214861.304219,40324820.0,799.0
228,579621.4,1.0,9.0,0.0,12780100.0,626161.0
229,9.0,32956.380729,1140742.0,0.0,40282400.0,15877130.0
230,5.0,24536.428456,888724.8,5.0,45874530.0,1378.0


In [32]:
deter_data.isnull().sum()

DetTotal Cases        0
DetTotal Deaths       0
DetTotal Recovered    0
DetActive Cases       0
DetTotal Test         0
DetPopulation         0
dtype: int64

#Imputas dengan KNN

In [33]:
from sklearn.impute import KNNImputer

In [34]:
#mengcopy dataset ke df5 dan mengecek nilai null
df5 = df_fraction.copy()
df5.isnull().sum()

variable
Active Cases       121
Population         109
Total Cases        111
Total Deaths       116
Total Recovered    109
Total Test         126
dtype: int64

In [35]:
#memasukkan df5 kedalam dataframe
df5 = pd.DataFrame(df5)
print("Data sebelum imputasi\n",df5)

#menggunakan KNN untuk mencari missing values
imputer = KNNImputer(n_neighbors=2)
imputasi_knn = imputer.fit_transform(df5)
print("\n\nData setelah imputasi\n",imputasi_knn)

Data sebelum imputasi
 variable  Active Cases  Population  Total Cases  Total Deaths  \
index                                                           
0            1741147.0         NaN          NaN     1132935.0   
1               1755.0         NaN          NaN           NaN   
2              95532.0  65584518.0          NaN           NaN   
3             216022.0         NaN          NaN      165711.0   
4                  NaN         NaN   36824580.0           NaN   
...                ...         ...          ...           ...   
226                NaN         NaN          NaN           NaN   
227                NaN       799.0          NaN           NaN   
228                0.0    626161.0          NaN           1.0   
229                0.0         NaN          9.0           NaN   
230                5.0      1378.0          5.0           NaN   

variable  Total Recovered    Total Test  
index                                    
0                     NaN  1.159833e+09  
1    

In [36]:
#memasukkan hasil imputasi dari matrix ke dataframe
kolom = ['Total Cases', 'Total Deaths', 'Total Recovered', 'Active Cases','Total Test', 'Population']
imputasi_knn = pd.DataFrame(data = imputasi_knn, columns = kolom)

In [37]:
imputasi_knn

Unnamed: 0,Total Cases,Total Deaths,Total Recovered,Active Cases,Total Test,Population
0,1741147.0,122379744.0,12628993.0,1132935.0,10447624.0,1.159833e+09
1,1755.0,75033.0,6048.5,3813.5,172790.0,9.152658e+08
2,95532.0,65584518.0,7068.5,871.5,39264546.0,2.714902e+08
3,216022.0,64377049.0,3299385.0,165711.0,37398100.0,9.767359e+07
4,5584320.0,53443163.0,36824580.0,184941.0,35919372.0,6.377617e+07
...,...,...,...,...,...,...
221,0.5,5890.5,1075.0,7.5,699.0,6.889100e+04
222,3.0,799.0,376.0,35.5,29.0,6.889100e+04
223,0.0,626161.0,335999.0,1.0,9.0,3.257576e+06
224,0.0,14826461.5,9.0,37.5,2391412.5,3.166730e+05


In [38]:
df5.tail()

variable,Active Cases,Population,Total Cases,Total Deaths,Total Recovered,Total Test
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
226,,,,,699.0,
227,,799.0,,,29.0,
228,0.0,626161.0,,1.0,9.0,
229,0.0,,9.0,,,
230,5.0,1378.0,5.0,,,


In [39]:
#mengecek apakah masih ada data yang mengandung missing values
imputasi_knn.isnull().sum()

Total Cases        0
Total Deaths       0
Total Recovered    0
Active Cases       0
Total Test         0
Population         0
dtype: int64