#Preparasi Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=pd.read_csv('/content/covid_worldwide.csv',usecols=['Total Cases','Total Deaths','Total Recovered','Active Cases','Total Test','Population'])
df.head()

Unnamed: 0,Total Cases,Total Deaths,Total Recovered,Active Cases,Total Test,Population
0,104196861,1132935,101322779,1741147,1159832679,334805269
1,44682784,530740,44150289,1755,915265788,1406631776
2,39524311,164233,39264546,95532,271490188,65584518
3,37779833,165711,37398100,216022,122332384,83883596
4,36824580,697074,35919372,208134,63776166,215353593


In [3]:
df.dtypes

Total Cases        object
Total Deaths       object
Total Recovered    object
Active Cases       object
Total Test         object
Population         object
dtype: object

In [4]:
def clean_currency(x):
    """ If the value is a string, then remove currency symbol and delimiters
    otherwise, the value is numeric and can be converted
    """
    if isinstance(x, str):
        return(x.replace(',', ''))
    return(x)

In [5]:
df['Total Cases']=df['Total Cases'].apply(clean_currency).astype('float')
df['Total Deaths']=df['Total Deaths'].apply(clean_currency).astype('float')
df['Total Recovered']=df['Total Recovered'].apply(clean_currency).astype('float')
df['Active Cases']=df['Active Cases'].apply(clean_currency).astype('float')
df['Total Test']=df['Total Test'].apply(clean_currency).astype('float')
df['Population']=df['Population'].apply(clean_currency).astype('float')

In [6]:
df.dtypes

Total Cases        float64
Total Deaths       float64
Total Recovered    float64
Active Cases       float64
Total Test         float64
Population         float64
dtype: object

In [7]:
df.describe()

Unnamed: 0,Total Cases,Total Deaths,Total Recovered,Active Cases,Total Test,Population
count,231.0,225.0,210.0,212.0,213.0,228.0
mean,2923460.0,30057.78,2993905.0,90998.85,32493160.0,28493260.0
std,9479286.0,105380.6,9520209.0,766388.8,117737000.0,102280300.0
min,5.0,1.0,2.0,0.0,7850.0,799.0
25%,24001.0,223.0,21972.5,52.5,347815.0,445081.5
50%,206592.0,2179.0,235145.5,1115.0,2216193.0,5676382.0
75%,1296146.0,14452.0,1465768.0,10228.75,12984260.0,21707590.0
max,104196900.0,1132935.0,101322800.0,10952620.0,1159833000.0,1406632000.0


In [8]:
df.isnull().sum()

Total Cases         0
Total Deaths        6
Total Recovered    21
Active Cases       19
Total Test         18
Population          3
dtype: int64

In [9]:
df = df.reset_index()
df_onecolumn = pd.melt(df, id_vars=['index'])
df_sampled = df_onecolumn.sample(frac=0.50).reset_index(drop=True)
df_fraction = df_sampled.pivot(index='index', columns='variable', values='value')

df_fraction

variable,Active Cases,Population,Total Cases,Total Deaths,Total Recovered,Total Test
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,,334805269.0,,1132935.0,101322779.0,
1,1755.0,,,530740.0,,915265788.0
2,,65584518.0,,,,
3,216022.0,83883596.0,37779833.0,,,
4,,,36824580.0,697074.0,35919372.0,63776166.0
...,...,...,...,...,...,...
226,0.0,,712.0,13.0,,
227,,,29.0,,,
228,0.0,626161.0,10.0,,,
229,0.0,,9.0,2.0,7.0,


In [10]:
df_fraction.isnull().sum()

variable
Active Cases       125
Population          95
Total Cases        118
Total Deaths       114
Total Recovered    118
Total Test         131
dtype: int64

#Imputasi dengan mean

In [11]:
df1=df_fraction.copy()
df1.isnull().sum()

variable
Active Cases       125
Population          95
Total Cases        118
Total Deaths       114
Total Recovered    118
Total Test         131
dtype: int64

In [12]:
df1['Total Cases']=df1['Total Cases'].fillna(df1['Total Cases'].mean())
df1['Total Deaths']=df1['Total Deaths'].fillna(df1['Total Deaths'].mean())
df1['Total Recovered']=df1['Total Recovered'].fillna(df1['Total Recovered'].mean())
df1['Active Cases']=df1['Active Cases'].fillna(df1['Active Cases'].mean())
df1['Total Test']=df1['Total Test'].fillna(df1['Total Test'].mean())
df1['Population']=df1['Population'].fillna(df1['Population'].mean())

In [13]:
df1.isnull().sum()

variable
Active Cases       0
Population         0
Total Cases        0
Total Deaths       0
Total Recovered    0
Total Test         0
dtype: int64

In [14]:
df1.describe()

variable,Active Cases,Population,Total Cases,Total Deaths,Total Recovered,Total Test
count,227.0,227.0,227.0,227.0,227.0,227.0
mean,28152.333333,19999780.0,2343320.0,32944.84,3201162.0,24019860.0
std,58177.219177,35970060.0,4258341.0,96325.59,7644939.0,69677070.0
min,0.0,1378.0,9.0,1.0,2.0,7850.0
25%,1811.5,2820812.0,213259.0,1592.5,242543.5,3153862.0
50%,28152.333333,19999780.0,2343320.0,32944.84,3201162.0,24019860.0
75%,28152.333333,19999780.0,2343320.0,32944.84,3201162.0,24019860.0
max,429421.0,334805300.0,37779830.0,1132935.0,101322800.0,915265800.0


#imputasi dengan median

In [15]:
df2=df_fraction.copy()
df2.isnull().sum()

variable
Active Cases       125
Population          95
Total Cases        118
Total Deaths       114
Total Recovered    118
Total Test         131
dtype: int64

In [16]:
df2['Total Cases']=df2['Total Cases'].fillna(df2['Total Cases'].median())
df2['Total Deaths']=df2['Total Deaths'].fillna(df2['Total Deaths'].median())
df2['Total Recovered']=df2['Total Recovered'].fillna(df2['Total Recovered'].median())
df2['Active Cases']=df2['Active Cases'].fillna(df2['Active Cases'].median())
df2['Total Test']=df2['Total Test'].fillna(df2['Total Test'].median())
df2['Population']=df2['Population'].fillna(df2['Population'].median())

In [17]:
df2.isnull().sum()

variable
Active Cases       0
Population         0
Total Cases        0
Total Deaths       0
Total Recovered    0
Total Test         0
dtype: int64

In [18]:
df2.describe()

variable,Active Cases,Population,Total Cases,Total Deaths,Total Recovered,Total Test
count,227.0,227.0,227.0,227.0,227.0,227.0
mean,13096.257709,13705320.0,1214155.0,17134.58,1633855.0,11207470.0
std,59752.678627,36730640.0,4395048.0,97608.84,7792575.0,70538810.0
min,0.0,1378.0,9.0,1.0,2.0,7850.0
25%,810.5,2820812.0,171112.0,1463.0,186088.0,1818248.0
50%,810.5,4959330.0,171112.0,1463.0,186088.0,1818248.0
75%,810.5,6617619.0,171112.0,1463.0,186088.0,1818248.0
max,429421.0,334805300.0,37779830.0,1132935.0,101322800.0,915265800.0


#imputasi dengan modus

In [19]:
import statistics

In [20]:
df3=df_fraction.copy()
df3.isnull().sum()

variable
Active Cases       125
Population          95
Total Cases        118
Total Deaths       114
Total Recovered    118
Total Test         131
dtype: int64

In [21]:
df3['Total Cases']=df3['Total Cases'].fillna(statistics.mode(df3['Total Cases']))
df3['Total Deaths']=df3['Total Deaths'].fillna(statistics.mode(df3['Total Deaths']))
df3['Total Recovered']=df3['Total Recovered'].fillna(statistics.mode(df3['Total Recovered']))
df3['Active Cases']=df3['Active Cases'].fillna(statistics.mode(df3['Active Cases']))
df3['Total Test']=df3['Total Test'].fillna(statistics.mode(df3['Total Test']))
df3['Population']=df3['Population'].fillna(statistics.mode(df3['Population']))

In [22]:
df3.isnull().sum()

variable
Active Cases         0
Population           0
Total Cases        118
Total Deaths         0
Total Recovered      0
Total Test         131
dtype: int64

In [23]:
df3.describe()

variable,Active Cases,Population,Total Cases,Total Deaths,Total Recovered,Total Test
count,227.0,227.0,109.0,227.0,227.0,96.0
mean,12649.947137,151746600.0,2343320.0,16437.02,54207110.0,24019860.0
std,59846.143698,159743400.0,6160029.0,97723.75,49721810.0,107468800.0
min,0.0,1378.0,9.0,1.0,2.0,7850.0
25%,0.0,2820812.0,18491.0,74.0,242543.5,253363.0
50%,0.0,35844910.0,171112.0,74.0,101322800.0,1818248.0
75%,179.0,334805300.0,1477856.0,1462.5,101322800.0,8007675.0
max,429421.0,334805300.0,37779830.0,1132935.0,101322800.0,915265800.0


#Imputasi dengan regresi linier

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [25]:
df4=df_fraction.copy()
df4.isnull().sum()

variable
Active Cases       125
Population          95
Total Cases        118
Total Deaths       114
Total Recovered    118
Total Test         131
dtype: int64

In [26]:
missing_columns=['Total Cases','Total Deaths','Total Recovered','Active Cases','Total Test','Population']

In [27]:
def random_imputation(df, feature):

    number_missing = df[feature].isnull().sum()
    observed_values = df.loc[df[feature].notnull(), feature]
    df.loc[df[feature].isnull(), feature + '_imp'] = np.random.choice(observed_values, number_missing, replace = True)
    
    return df

In [28]:
for feature in missing_columns:
    df4[feature + '_imp'] = df4[feature]
    df4 = random_imputation(df4, feature)

In [29]:
df4.tail()

variable,Active Cases,Population,Total Cases,Total Deaths,Total Recovered,Total Test,Total Cases_imp,Total Deaths_imp,Total Recovered_imp,Active Cases_imp,Total Test_imp,Population_imp
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
226,0.0,,712.0,13.0,,,712.0,13.0,2761.0,0.0,496693.0,9326000.0
227,,,29.0,,,,29.0,172.0,7.0,3615.0,624784.0,3496016.0
228,0.0,626161.0,10.0,,,,10.0,1043.0,50440.0,0.0,24976.0,626161.0
229,0.0,,9.0,2.0,7.0,,9.0,2.0,7.0,0.0,176919.0,288023.0
230,5.0,1378.0,,,,,2693458.0,74.0,5532366.0,5.0,1907195.0,1378.0


In [30]:
deter_data = pd.DataFrame(columns = ["Det" + name for name in missing_columns])
for feature in missing_columns:
        
    deter_data["Det" + feature] = df4[feature + "_imp"]
    parameters = list(set(df4.columns) - set(missing_columns) - {feature + '_imp'})
    model = LinearRegression()
    model.fit(X = df4[parameters], y = df4[feature + '_imp'])
    deter_data.loc[df4[feature].isnull(), "Det" + feature] = model.predict(df4[parameters])[df4[feature].isnull()]

In [31]:
deter_data.tail()

Unnamed: 0_level_0,DetTotal Cases,DetTotal Deaths,DetTotal Recovered,DetActive Cases,DetTotal Test,DetPopulation
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
226,712.0,13.0,1727965.0,0.0,23829910.0,18449910.0
227,29.0,-66.837253,1351722.0,16591.656231,24112510.0,18540790.0
228,10.0,-1640.061131,1284153.0,0.0,24692630.0,626161.0
229,9.0,2.0,7.0,0.0,24529040.0,18452240.0
230,2246939.0,15563.043751,1896303.0,5.0,21269470.0,1378.0


In [32]:
deter_data.isnull().sum()

DetTotal Cases        0
DetTotal Deaths       0
DetTotal Recovered    0
DetActive Cases       0
DetTotal Test         0
DetPopulation         0
dtype: int64

#Imputas dengan KNN

In [33]:
from sklearn.impute import KNNImputer

In [34]:
#mengcopy dataset ke df5 dan mengecek nilai null
df5 = df_fraction.copy()
df5.isnull().sum()

variable
Active Cases       125
Population          95
Total Cases        118
Total Deaths       114
Total Recovered    118
Total Test         131
dtype: int64

In [35]:
#memasukkan df5 kedalam dataframe
df5 = pd.DataFrame(df5)
print("Data sebelum imputasi\n",df5)

#menggunakan KNN untuk mencari missing values
imputer = KNNImputer(n_neighbors=2)
imputasi_knn = imputer.fit_transform(df5)
print("\n\nData setelah imputasi\n",imputasi_knn)

Data sebelum imputasi
 variable  Active Cases   Population  Total Cases  Total Deaths  \
index                                                            
0                  NaN  334805269.0          NaN     1132935.0   
1               1755.0          NaN          NaN      530740.0   
2                  NaN   65584518.0          NaN           NaN   
3             216022.0   83883596.0   37779833.0           NaN   
4                  NaN          NaN   36824580.0      697074.0   
...                ...          ...          ...           ...   
226                0.0          NaN        712.0          13.0   
227                NaN          NaN         29.0           NaN   
228                0.0     626161.0         10.0           NaN   
229                0.0          NaN          9.0           2.0   
230                5.0       1378.0          NaN           NaN   

variable  Total Recovered   Total Test  
index                                   
0             101322779.0          N

In [36]:
#memasukkan hasil imputasi dari matrix ke dataframe
kolom = ['Total Cases', 'Total Deaths', 'Total Recovered', 'Active Cases','Total Test', 'Population']
imputasi_knn = pd.DataFrame(data = imputasi_knn, columns = kolom)

In [37]:
imputasi_knn

Unnamed: 0,Total Cases,Total Deaths,Total Recovered,Active Cases,Total Test,Population
0,5679.5,334805269.0,2439855.0,1132935.0,101322779.0,463710137.0
1,1755.0,11436886.5,934272.5,530740.0,310159.5,915265788.0
2,128443.5,65584518.0,3199486.5,15188.0,5046003.5,31712278.0
3,216022.0,83883596.0,37779833.0,1140.0,1205633.0,465256401.0
4,239283.0,133938706.5,36824580.0,697074.0,35919372.0,63776166.0
...,...,...,...,...,...,...
222,0.0,3529489.0,712.0,13.0,12862.0,46256.5
223,0.0,319113.5,29.0,7.5,968.5,8241.0
224,0.0,626161.0,10.0,82.5,5046.5,341900.0
225,0.0,318532.0,9.0,2.0,7.0,49577.5


In [38]:
df5.tail()

variable,Active Cases,Population,Total Cases,Total Deaths,Total Recovered,Total Test
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
226,0.0,,712.0,13.0,,
227,,,29.0,,,
228,0.0,626161.0,10.0,,,
229,0.0,,9.0,2.0,7.0,
230,5.0,1378.0,,,,


In [39]:
#mengecek apakah masih ada data yang mengandung missing values
imputasi_knn.isnull().sum()

Total Cases        0
Total Deaths       0
Total Recovered    0
Active Cases       0
Total Test         0
Population         0
dtype: int64