#Preparasi Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=pd.read_csv('/content/covid_worldwide.csv',usecols=['Total Cases','Total Deaths','Total Recovered','Active Cases','Total Test','Population'])
df.tail()

Unnamed: 0,Total Cases,Total Deaths,Total Recovered,Active Cases,Total Test,Population
226,712,13.0,699.0,0,,
227,29,,29.0,0,,799.0
228,10,1.0,9.0,0,,626161.0
229,9,2.0,7.0,0,,
230,5,,,5,,1378.0


In [3]:
df.dtypes

Total Cases        object
Total Deaths       object
Total Recovered    object
Active Cases       object
Total Test         object
Population         object
dtype: object

In [4]:
def clean_currency(x):
    """ If the value is a string, then remove currency symbol and delimiters
    otherwise, the value is numeric and can be converted
    """
    if isinstance(x, str):
        return(x.replace(',', ''))
    return(x)

In [5]:
df['Total Cases']=df['Total Cases'].apply(clean_currency).astype('float')
df['Total Deaths']=df['Total Deaths'].apply(clean_currency).astype('float')
df['Total Recovered']=df['Total Recovered'].apply(clean_currency).astype('float')
df['Active Cases']=df['Active Cases'].apply(clean_currency).astype('float')
df['Total Test']=df['Total Test'].apply(clean_currency).astype('float')
df['Population']=df['Population'].apply(clean_currency).astype('float')

In [6]:
df.dtypes

Total Cases        float64
Total Deaths       float64
Total Recovered    float64
Active Cases       float64
Total Test         float64
Population         float64
dtype: object

In [7]:
df.describe()

Unnamed: 0,Total Cases,Total Deaths,Total Recovered,Active Cases,Total Test,Population
count,231.0,225.0,210.0,212.0,213.0,228.0
mean,2923460.0,30057.78,2993905.0,90998.85,32493160.0,28493260.0
std,9479286.0,105380.6,9520209.0,766388.8,117737000.0,102280300.0
min,5.0,1.0,2.0,0.0,7850.0,799.0
25%,24001.0,223.0,21972.5,52.5,347815.0,445081.5
50%,206592.0,2179.0,235145.5,1115.0,2216193.0,5676382.0
75%,1296146.0,14452.0,1465768.0,10228.75,12984260.0,21707590.0
max,104196900.0,1132935.0,101322800.0,10952620.0,1159833000.0,1406632000.0


In [8]:
df.isnull().sum()

Total Cases         0
Total Deaths        6
Total Recovered    21
Active Cases       19
Total Test         18
Population          3
dtype: int64

In [9]:
df = df.reset_index()
df_onecolumn = pd.melt(df, id_vars=['index'])
df_sampled = df_onecolumn.sample(frac=0.50).reset_index(drop=True)
df_fraction = df_sampled.pivot(index='index', columns='variable', values='value')

df_fraction

variable,Active Cases,Population,Total Cases,Total Deaths,Total Recovered,Total Test
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1741147.0,3.348053e+08,104196861.0,1132935.0,101322779.0,1.159833e+09
1,,1.406632e+09,44682784.0,,44150289.0,9.152658e+08
2,95532.0,6.558452e+07,39524311.0,,,2.714902e+08
3,,,,,,1.223324e+08
4,,,36824580.0,,,
...,...,...,...,...,...,...
226,,,712.0,13.0,,
227,,,,,29.0,
228,0.0,6.261610e+05,10.0,,9.0,
229,,,,2.0,,


In [10]:
df_fraction.isnull().sum()

variable
Active Cases       117
Population         112
Total Cases        110
Total Deaths       119
Total Recovered    125
Total Test         124
dtype: int64

In [11]:
df=df.drop(['index'],axis=1)
df

Unnamed: 0,Total Cases,Total Deaths,Total Recovered,Active Cases,Total Test,Population
0,104196861.0,1132935.0,101322779.0,1741147.0,1.159833e+09,3.348053e+08
1,44682784.0,530740.0,44150289.0,1755.0,9.152658e+08,1.406632e+09
2,39524311.0,164233.0,39264546.0,95532.0,2.714902e+08,6.558452e+07
3,37779833.0,165711.0,37398100.0,216022.0,1.223324e+08,8.388360e+07
4,36824580.0,697074.0,35919372.0,208134.0,6.377617e+07,2.153536e+08
...,...,...,...,...,...,...
226,712.0,13.0,699.0,0.0,,
227,29.0,,29.0,0.0,,7.990000e+02
228,10.0,1.0,9.0,0.0,,6.261610e+05
229,9.0,2.0,7.0,0.0,,


#Imputasi dengan mean

In [12]:
df1=df_fraction.copy()
df1.isnull().sum()

variable
Active Cases       117
Population         112
Total Cases        110
Total Deaths       119
Total Recovered    125
Total Test         124
dtype: int64

In [13]:
df1['Total Cases']=df1['Total Cases'].fillna(df1['Total Cases'].mean())
df1['Total Deaths']=df1['Total Deaths'].fillna(df1['Total Deaths'].mean())
df1['Total Recovered']=df1['Total Recovered'].fillna(df1['Total Recovered'].mean())
df1['Active Cases']=df1['Active Cases'].fillna(df1['Active Cases'].mean())
df1['Total Test']=df1['Total Test'].fillna(df1['Total Test'].mean())
df1['Population']=df1['Population'].fillna(df1['Population'].mean())

In [14]:
df1.isnull().sum()

variable
Active Cases       0
Population         0
Total Cases        0
Total Deaths       0
Total Recovered    0
Total Test         0
dtype: int64

In [15]:
df1.describe()

variable,Active Cases,Population,Total Cases,Total Deaths,Total Recovered,Total Test
count,228.0,228.0,228.0,228.0,228.0,228.0
mean,39125.5,31678050.0,3804127.0,38050.17,3080496.0,40320410.0
std,133192.0,96405140.0,8667280.0,85198.72,7535847.0,102878000.0
min,0.0,3539.0,5.0,2.0,9.0,7850.0
25%,664.25,5737094.0,208056.8,4369.5,479234.2,3898638.0
50%,39125.5,31678050.0,3804127.0,38050.17,3080496.0,40320410.0
75%,39125.5,31678050.0,3804127.0,38050.17,3080496.0,40320410.0
max,1741147.0,1406632000.0,104196900.0,1132935.0,101322800.0,1159833000.0


#imputasi dengan median

In [16]:
df2=df_fraction.copy()
df2.isnull().sum()

variable
Active Cases       117
Population         112
Total Cases        110
Total Deaths       119
Total Recovered    125
Total Test         124
dtype: int64

In [17]:
df2['Total Cases']=df2['Total Cases'].fillna(df2['Total Cases'].median())
df2['Total Deaths']=df2['Total Deaths'].fillna(df2['Total Deaths'].median())
df2['Total Recovered']=df2['Total Recovered'].fillna(df2['Total Recovered'].median())
df2['Active Cases']=df2['Active Cases'].fillna(df2['Active Cases'].median())
df2['Total Test']=df2['Total Test'].fillna(df2['Total Test'].median())
df2['Population']=df2['Population'].fillna(df2['Population'].median())

In [18]:
df2.isnull().sum()

variable
Active Cases       0
Population         0
Total Cases        0
Total Deaths       0
Total Recovered    0
Total Test         0
dtype: int64

In [19]:
df2.describe()

variable,Active Cases,Population,Total Cases,Total Deaths,Total Recovered,Total Test
count,228.0,228.0,228.0,228.0,228.0,228.0
mean,19363.54,18974070.0,2074551.0,20302.9,1576180.0,19861070.0
std,134581.8,97272000.0,8851264.0,86882.63,7659105.0,104577700.0
min,0.0,3539.0,5.0,2.0,9.0,7850.0
25%,615.0,5737094.0,208056.8,4047.0,336623.0,2701618.0
50%,615.0,5816378.0,219187.5,4047.0,336623.0,2701618.0
75%,615.0,5862099.0,274286.5,4047.0,336623.0,2701618.0
max,1741147.0,1406632000.0,104196900.0,1132935.0,101322800.0,1159833000.0


#imputasi dengan modus

In [20]:
import statistics

In [21]:
df3=df_fraction.copy()
df3.isnull().sum()

variable
Active Cases       117
Population         112
Total Cases        110
Total Deaths       119
Total Recovered    125
Total Test         124
dtype: int64

In [22]:
df3['Total Cases']=df3['Total Cases'].fillna(statistics.mode(df3['Total Cases']))
df3['Total Deaths']=df3['Total Deaths'].fillna(statistics.mode(df3['Total Deaths']))
df3['Total Recovered']=df3['Total Recovered'].fillna(statistics.mode(df3['Total Recovered']))
df3['Active Cases']=df3['Active Cases'].fillna(statistics.mode(df3['Active Cases']))
df3['Total Test']=df3['Total Test'].fillna(statistics.mode(df3['Total Test']))
df3['Population']=df3['Population'].fillna(statistics.mode(df3['Population']))

In [23]:
df3.isnull().sum()

variable
Active Cases       0
Population         0
Total Cases        0
Total Deaths       0
Total Recovered    0
Total Test         0
dtype: int64

In [24]:
df3.describe()

variable,Active Cases,Population,Total Cases,Total Deaths,Total Recovered,Total Test
count,228.0,228.0,228.0,228.0,228.0,228.0
mean,19047.94,180582600.0,52239220.0,18210.49,56941400.0,649178000.0
std,134626.3,179887700.0,51017460.0,87298.03,49575610.0,568216100.0
min,0.0,3539.0,5.0,2.0,9.0,7850.0
25%,0.0,5737094.0,208056.8,38.0,479234.2,3898638.0
50%,0.0,275776100.0,33510820.0,38.0,101322800.0,1159833000.0
75%,511.75,334805300.0,104196900.0,3236.25,101322800.0,1159833000.0
max,1741147.0,1406632000.0,104196900.0,1132935.0,101322800.0,1159833000.0


#Imputasi dengan regresi linier

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [26]:
df4=df_fraction.copy()
df4.isnull().sum()

variable
Active Cases       117
Population         112
Total Cases        110
Total Deaths       119
Total Recovered    125
Total Test         124
dtype: int64

In [27]:
missing_columns=['Total Cases','Total Deaths','Total Recovered','Active Cases','Total Test','Population']

In [28]:
def random_imputation(df, feature):

    number_missing = df[feature].isnull().sum()
    observed_values = df.loc[df[feature].notnull(), feature]
    df.loc[df[feature].isnull(), feature + '_imp'] = np.random.choice(observed_values, number_missing, replace = True)
    
    return df

In [29]:
for feature in missing_columns:
    df4[feature + '_imp'] = df4[feature]
    df4 = random_imputation(df4, feature)

In [31]:
deter_data = pd.DataFrame(columns = [name for name in missing_columns])
for feature in missing_columns:
        
    deter_data[feature] = df4[feature + "_imp"]
    parameters = list(set(df4.columns) - set(missing_columns) - {feature + '_imp'})
    model = LinearRegression()
    model.fit(X = df4[parameters], y = df4[feature + '_imp'])
    deter_data.loc[df4[feature].isnull(), feature] = model.predict(df4[parameters])[df4[feature].isnull()]

In [32]:
deter_data=deter_data.reset_index()
deter_data=deter_data.drop(['index'],axis=1)
deter_data

Unnamed: 0,Total Cases,Total Deaths,Total Recovered,Active Cases,Total Test,Population
0,1.041969e+08,1.132935e+06,1.013228e+08,1.741147e+06,1.159833e+09,3.348053e+08
1,4.468278e+07,2.273704e+05,4.415029e+07,3.289245e+05,9.152658e+08,1.406632e+09
2,3.952431e+07,1.190619e+05,1.340538e+07,9.553200e+04,2.714902e+08,6.558452e+07
3,3.854465e+06,3.273343e+04,3.125549e+06,2.148565e+04,1.223324e+08,5.925510e+07
4,3.682458e+07,2.563779e+05,1.864745e+07,1.838098e+05,2.109215e+08,-1.666169e+07
...,...,...,...,...,...,...
223,7.120000e+02,1.300000e+01,1.111658e+06,4.492028e+03,7.393578e+06,3.256867e+07
224,8.452191e+06,-7.923192e+02,2.900000e+01,-7.197333e+04,1.502851e+08,3.655397e+07
225,1.000000e+01,1.935740e+04,9.000000e+00,0.000000e+00,9.310656e+06,6.261610e+05
226,2.119570e+06,2.000000e+00,1.509232e+06,1.468504e+04,1.372419e+07,3.685778e+07


In [33]:
deter_data.isnull().sum()

Total Cases        0
Total Deaths       0
Total Recovered    0
Active Cases       0
Total Test         0
Population         0
dtype: int64

#Imputas dengan KNN

In [34]:
from sklearn.impute import KNNImputer

In [35]:
#mengcopy dataset ke df5 dan mengecek nilai null
df5 = df_fraction.copy()
df5.isnull().sum()

variable
Active Cases       117
Population         112
Total Cases        110
Total Deaths       119
Total Recovered    125
Total Test         124
dtype: int64

In [36]:
#memasukkan df5 kedalam dataframe
df5 = pd.DataFrame(df5)
print("Data sebelum imputasi\n",df5)

#menggunakan KNN untuk mencari missing values
imputer = KNNImputer(n_neighbors=2)
imputasi_knn = imputer.fit_transform(df5)
print("\n\nData setelah imputasi\n",imputasi_knn)

Data sebelum imputasi
 variable  Active Cases    Population  Total Cases  Total Deaths  \
index                                                             
0            1741147.0  3.348053e+08  104196861.0     1132935.0   
1                  NaN  1.406632e+09   44682784.0           NaN   
2              95532.0  6.558452e+07   39524311.0           NaN   
3                  NaN           NaN          NaN           NaN   
4                  NaN           NaN   36824580.0           NaN   
...                ...           ...          ...           ...   
226                NaN           NaN        712.0          13.0   
227                NaN           NaN          NaN           NaN   
228                0.0  6.261610e+05         10.0           NaN   
229                NaN           NaN          NaN           2.0   
230                5.0           NaN          5.0           NaN   

variable  Total Recovered    Total Test  
index                                    
0             1013227

In [37]:
#memasukkan hasil imputasi dari matrix ke dataframe
kolom = ['Total Cases', 'Total Deaths', 'Total Recovered', 'Active Cases','Total Test', 'Population']
imputasi_knn = pd.DataFrame(data = imputasi_knn, columns = kolom)

In [38]:
imputasi_knn

Unnamed: 0,Total Cases,Total Deaths,Total Recovered,Active Cases,Total Test,Population
0,1741147.0,3.348053e+08,104196861.0,1132935.0,101322779.0,1.159833e+09
1,25234.5,1.406632e+09,44682784.0,158627.0,44150289.0,9.152658e+08
2,95532.0,6.558452e+07,39524311.0,27382.0,1974436.0,2.714902e+08
3,182114.5,6.659457e+07,6555257.0,114608.0,1444268.0,1.223324e+08
4,53636.0,5.845721e+07,36824580.0,195502.0,36945583.0,5.933780e+08
...,...,...,...,...,...,...
223,36.5,7.404250e+04,712.0,13.0,5208.0,1.422685e+05
224,0.0,4.142000e+05,378.5,29.0,29.0,1.333635e+05
225,0.0,6.261610e+05,10.0,252.5,9.0,8.544055e+05
226,36.5,7.404250e+04,1057.5,2.0,5208.0,7.529800e+04


In [39]:
df5.tail()

variable,Active Cases,Population,Total Cases,Total Deaths,Total Recovered,Total Test
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
226,,,712.0,13.0,,
227,,,,,29.0,
228,0.0,626161.0,10.0,,9.0,
229,,,,2.0,,
230,5.0,,5.0,,,


In [40]:
#mengecek apakah masih ada data yang mengandung missing values
imputasi_knn.isnull().sum()

Total Cases        0
Total Deaths       0
Total Recovered    0
Active Cases       0
Total Test         0
Population         0
dtype: int64