In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
import plotly.graph_objects as go
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score, pairwise_distances_chunked, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

In [None]:
# Read the raw COVID table from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer="covid.csv")
df.head(n=5)

In [None]:
# Parse the "date" column into datetimes so rows can be ordered chronologically.
df["date"] = df["date"].pipe(pd.to_datetime)

In [None]:
# Discard rows without a "new_cases" value, then confirm none are left (expected 0).
df = df.dropna(axis="index", subset=["new_cases"])
df["new_cases"].isnull().sum()

In [None]:
# Remove the "iso_code" column from the working frame.
df = df.drop(columns=["iso_code"])

In [None]:
# Order rows chronologically, then group them per location for the rt computation.
temp = df.sort_values(by=["date"], ascending=True).groupby(by=["location"])

In [None]:
def calculate_rt(g_arr):
    """Return the ratio between two consecutive 4-value sums of ``g_arr``.

    Parameters
    ----------
    g_arr : sequence of numbers
        The first four entries form the numerator sum and entries 4..7 form
        the denominator sum.

    Returns
    -------
    number
        ``sum(g_arr[0:4]) / sum(g_arr[4:8])`` rounded to two decimals, or 0
        when either of the two sums is zero.
    """
    first_sum = sum(g_arr[:4])
    second_sum = sum(g_arr[4:8])
    # Only divide when both sums are non-zero; otherwise report 0.
    if first_sum != 0 and second_sum != 0:
        return np.round(first_sum / second_sum, 2)
    return 0

# Build an "rt" column per location: for each row, compare the sum of the
# four most recent new_cases values with the sum of the four values before
# them (see calculate_rt).
group_arr = []
for name, group in temp:
    # reset_index() keeps the old index as an "index" column and gives each
    # group a fresh 0..n-1 positional index.
    temp_group = group.reset_index()
    # Pull the column out once instead of doing a per-element row lookup.
    cases = temp_group["new_cases"].to_numpy()
    # Rows 0..7 keep rt = 0, exactly as before.
    # NOTE(review): row 7 already has a full 8-value window but is still
    # left at 0 -- confirm whether that is intended.
    rt = np.zeros(len(temp_group))
    for i in range(8, len(temp_group)):
        # Window is ordered newest-first: rows i, i-1, ..., i-7.
        rt[i] = calculate_rt(cases[i - 7:i + 1][::-1])
    temp_group["rt"] = rt
    group_arr.append(temp_group)
newdf = pd.concat(group_arr)
newdf.head()

In [None]:
# Drop the "index" column that reset_index() left behind in each group.
newdf = newdf.drop(columns=["index"])

In [None]:
# Line plot of the computed rt over time for France.
fig_dims = (20, 5)
france_rows = newdf[newdf["location"] == "France"]
fig, ax = plt.subplots(figsize=fig_dims)
sns.lineplot(x=france_rows["date"], y=france_rows["rt"], ax=ax)

In [None]:
# Print the number of missing values in every column.
for i, n_missing in newdf.isna().sum().items():
    print(f"{i} nan count - ", n_missing)

Drop columns with too many missing values: keep only columns that are at least 70% non-NA (i.e. drop columns with more than 30% NA)

In [None]:
# Keep only columns whose non-NA count reaches 70% of the row count.
# Counts are integers, so rounding the threshold up selects the same columns.
min_non_na = int(np.ceil(len(newdf) * .7))
newdf = newdf.dropna(axis="columns", thresh=min_non_na)
# Re-print the per-column missing-value counts after the drop.
for i, n_missing in newdf.isna().sum().items():
    print(f"{i} nan count - ", n_missing)

In [None]:
# Display the rows whose total_cases value is missing.
newdf[newdf.total_cases.isna()]

NA replacement

In [None]:
# Replace missing total_cases with 0, then show any rows still missing (expected: none).
newdf["total_cases"] = newdf["total_cases"].fillna(value=0)
newdf.loc[newdf["total_cases"].isna()]

In [None]:
# Fill the remaining gaps: every column except "continent" is forward-filled
# and then back-filled; missing continents get the placeholder "No".
# NOTE(review): the fill runs over the whole concatenated frame, so values can
# carry over from one location's rows into the next -- confirm this is acceptable.
for i in newdf.columns:
    if i == "continent":
        continue
    if newdf[i].isna().any():
        # .ffill()/.bfill() replace the deprecated fillna(method=...) form.
        newdf[i] = newdf[i].ffill().bfill()
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection, which does not update the frame when pandas Copy-on-Write is active.
newdf["continent"] = newdf["continent"].fillna("No")

In [None]:
# Print the per-column missing-value counts again after the fill step.
for i, n_missing in newdf.isna().sum().items():
    print(f"{i} nan count - ", n_missing)

In [None]:
# Preview the frame after the NA fill step.
newdf.head()

In [None]:
# Encode the continent names as integer codes; `le` keeps the mapping.
le = LabelEncoder().fit(newdf["continent"])
newdf["continent"] = le.transform(newdf["continent"])
newdf.head(n=5)

In [None]:
# Encode the location names as integer codes; `le2` keeps the mapping.
le2 = LabelEncoder().fit(newdf["location"])
newdf["location"] = le2.transform(newdf["location"])

In [None]:
# Preview the frame after label-encoding continent and location.
newdf.head()

In [None]:
# Feature matrix for the embedding/clustering steps: every column except "date".
train_df = newdf.loc[:, [col for col in newdf.columns if col != "date"]]

In [None]:
# 2-D t-SNE embedding of the feature matrix.
# t-SNE is stochastic: seed it (same seed as the KMeans cells) so the embedding
# and every plot/cluster derived from it are reproducible across runs.
tsne = TSNE(random_state=42)
d2ata = tsne.fit_transform(train_df)

fig_dims = (10, 10)
fig, ax = plt.subplots(figsize=fig_dims)

sns.scatterplot(x=d2ata[:, 0], y=d2ata[:, 1], ax=ax)

In [None]:
# 3-D t-SNE embedding of the feature matrix, shown as an interactive scatter.
# Seeded (same seed as the KMeans cells) so the embedding and the clusters
# fitted on it are reproducible across runs.
tsne = TSNE(n_jobs=-1, n_components=3, random_state=42)
d3ata = tsne.fit_transform(train_df)
marker_data = go.Scatter3d(
    x=d3ata[:, 0],
    y=d3ata[:, 1],
    z=d3ata[:, 2],
    marker=go.scatter3d.Marker(size=3),
    opacity=0.8,
    mode='markers'
)
fig=go.Figure(data=marker_data)
fig.show()

In [None]:
# Cluster the 3-D embedding into three groups.
kmeans = MiniBatchKMeans(n_clusters=3, random_state=42, max_iter=250)
labels = kmeans.fit_predict(X=d3ata)

3D Plotting

In [None]:
# Interactive 3-D scatter of the embedding, coloured by the `labels` clusters.
marker_style = {"color": labels, "size": 3}
marker_data = go.Scatter3d(
    x=d3ata[:, 0],
    y=d3ata[:, 1],
    z=d3ata[:, 2],
    mode='markers',
    opacity=0.8,
    marker=marker_style,
)
fig = go.Figure(data=marker_data)
fig.show()

In [None]:
# Cluster the 2-D embedding into six groups.
kmeans2 = MiniBatchKMeans(n_clusters=6, random_state=42, max_iter=250)
labels2 = kmeans2.fit_predict(X=d2ata)

Scatterplot

In [None]:
# 2-D embedding coloured by the six clusters fitted on the 2-D embedding.
fig_dims = (10, 10)
fig, ax = plt.subplots(figsize=fig_dims)

emb_x, emb_y = d2ata[:, 0], d2ata[:, 1]
sns.scatterplot(x=emb_x, y=emb_y, ax=ax, c=labels2)

In [None]:
# Cluster the original feature matrix (not the embedding) into six groups.
kmeans3 = MiniBatchKMeans(n_clusters=6, random_state=42, max_iter=250)
labels3 = kmeans3.fit_predict(X=train_df)

In [None]:
# 2-D embedding coloured by the six clusters fitted on the raw features.
fig_dims = (10, 10)
fig, ax = plt.subplots(figsize=fig_dims)

emb_x, emb_y = d2ata[:, 0], d2ata[:, 1]
sns.scatterplot(x=emb_x, y=emb_y, ax=ax, c=labels3)

Elbow plots (inertia vs. number of clusters) to choose the best number of clusters

In [None]:
# Elbow curve on the raw features: inertia for 2..8 clusters.
inertia = {}
for i in range(2, 9):
    kmeansX = MiniBatchKMeans(n_clusters=i, random_state=42, max_iter=250)
    labelsX = kmeansX.fit_predict(X=train_df)
    inertia[i] = kmeansX.inertia_
cluster_counts, inertia_values = zip(*inertia.items())
sns.lineplot(x=list(cluster_counts), y=list(inertia_values))

In [None]:
# Elbow curve on the 2-D embedding: inertia for 2..8 clusters.
inertia = {}
for i in range(2, 9):
    kmeansX = MiniBatchKMeans(n_clusters=i, random_state=42, max_iter=250)
    labelsX = kmeansX.fit_predict(X=d2ata)
    inertia[i] = kmeansX.inertia_
cluster_counts, inertia_values = zip(*inertia.items())
sns.lineplot(x=list(cluster_counts), y=list(inertia_values))

In [None]:
# Elbow curve on the 3-D embedding: inertia for 2..8 clusters.
inertia = {}
for i in range(2, 9):
    kmeansX = MiniBatchKMeans(n_clusters=i, random_state=42, max_iter=250)
    labelsX = kmeansX.fit_predict(X=d3ata)
    inertia[i] = kmeansX.inertia_
cluster_counts, inertia_values = zip(*inertia.items())
sns.lineplot(x=list(cluster_counts), y=list(inertia_values))

In [None]:
# Cluster the original feature matrix into three groups.
kmeans4 = MiniBatchKMeans(n_clusters=3, random_state=42, max_iter=250)
labels4 = kmeans4.fit_predict(X=train_df)

In [None]:
# 2-D embedding coloured by the three clusters fitted on the raw features.
fig_dims = (10, 10)
fig, ax = plt.subplots(figsize=fig_dims)

emb_x, emb_y = d2ata[:, 0], d2ata[:, 1]
sns.scatterplot(x=emb_x, y=emb_y, ax=ax, c=labels4)

In [None]:
# Interactive 3-D scatter of the embedding, coloured by the `labels4` clusters.
marker_style = {"color": labels4, "size": 3}
marker_data = go.Scatter3d(
    x=d3ata[:, 0],
    y=d3ata[:, 1],
    z=d3ata[:, 2],
    mode='markers',
    opacity=0.8,
    marker=marker_style,
)
fig = go.Figure(data=marker_data)
fig.show()

In [None]:
# Cluster the 2-D embedding into three groups; these labels are attached to newdf later.
kmeans5 = MiniBatchKMeans(n_clusters=3, random_state=42, max_iter=250)
labels5 = kmeans5.fit_predict(X=d2ata)

In [None]:
# Larger 2-D embedding plot coloured by the three clusters fitted on the 2-D embedding.
fig_dims = (15, 15)
fig, ax = plt.subplots(figsize=fig_dims)

emb_x, emb_y = d2ata[:, 0], d2ata[:, 1]
sns.scatterplot(x=emb_x, y=emb_y, ax=ax, c=labels5)

In [None]:
# List the columns currently present in newdf.
newdf.columns

In [None]:
# Attach the cluster assignment from kmeans5 to each row and display the frame.
newdf = newdf.assign(label=labels5)
newdf

In [None]:
# Rows for France that were assigned to cluster 1.
# The cluster labels live on newdf (df has no "label" column), and newdf's
# "location" column was label-encoded by le2, so "France" has to be translated
# to its integer code before filtering.
france_code = le2.transform(["France"])[0]
x = newdf[newdf["location"] == france_code]
x[x["label"] == 1]

In [None]:
# Last France row of df, in df's current row order.
# NOTE(review): df does not carry the rt/label columns -- confirm df (not newdf) is intended.
last_day = df.loc[df["location"].eq("France")].iloc[-1]

In [None]:
# Display the selected France row.
last_day