In [32]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
import pycountry
import sqlite3

In [1]:
def lower_df(df):
    """
    Lowercase the string values of every text column in a DataFrame,
    except columns whose label contains 'Code'.

    The DataFrame is modified in place and the same object is returned,
    so the call can be used either as a statement or as an expression.

    Parameters
    ----------
    df : DataFrame
        A pandas DataFrame that contains the data to be processed.

    Returns
    -------
    DataFrame
        The same DataFrame object, with its text columns lowercased.
        Non-string values found in object columns (numbers, missing
        values, ...) are left untouched.

    """
    for col in df.columns:
        # Column labels are not always strings (e.g. integer labels), so
        # test their text form rather than the label itself.
        if 'Code' in str(col):
            continue

        column = df[col]
        if pd.api.types.is_object_dtype(column.dtype):
            # Object columns may mix strings with other values. `.str.lower()`
            # would replace every non-string with NaN (or raise when the
            # column holds no strings at all), so lowercase element by
            # element and keep everything else as it is.
            lowered = [value.lower() if isinstance(value, str) else value
                       for value in column]
            df[col] = pd.Series(lowered, index=column.index, dtype=object)
        elif pd.api.types.is_string_dtype(column.dtype):
            # Dedicated string dtypes only hold strings / missing values,
            # and `.str.lower()` keeps the column's dtype.
            df[col] = column.str.lower()
    return df

In [34]:
# Open the SQLite database file 'WDI.db'; the relative path is resolved
# against the current working directory.
# NOTE(review): sqlite3.connect creates an empty database when the file is
# missing, so a wrong working directory only shows up later as a
# "no such table" error. No conn.close() is visible in this part of the
# notebook -- confirm the connection is closed once it is no longer needed.
conn = sqlite3.connect('WDI.db')

In [35]:
# Load every row and column of the `indicators` table into a DataFrame.
# The merge further down joins on an 'Indicator Name' column, so this table
# is expected to provide one -- TODO confirm against the database schema.
df_ed = pd.read_sql_query("SELECT * FROM indicators", conn)

In [36]:
# Read the WDI CSV export, then lowercase its text columns (lower_df skips
# columns whose name contains 'Code').
df_main = pd.read_csv('/mnt/data/public/wdi/WDIData.csv')
df_main = lower_df(df_main)

# Left join: keep every row of df_ed and attach the CSV rows that share the
# same 'Indicator Name'.
df_test = df_ed.merge(df_main, on='Indicator Name', how='left')

In [37]:
# Remove the two descriptive name columns.
df1 = df_test.drop(columns=['Indicator Name', 'Country Name'])

# Drop the last column, then discard rows that have no country code.
# NOTE(review): the last column is presumably an empty trailing column of the
# CSV export -- confirm before relying on this positional slice.
df = df1.iloc[:, :-1].dropna(subset=['Country Code'])

# Per country: number of missing values in each column.
null_counts = df.groupby('Country Code').apply(lambda grp: grp.isna().sum())

# Summary statistics of those counts, skipping the first two columns.
df_null = null_counts.describe().iloc[:, 2:]

In [38]:
# Wide -> long: one row per (country, indicator, year). The year columns are
# addressed by their string labels '1960' ... '2020'.
df_melted = df.melt(id_vars=['Country Code', 'Indicator Code'],
                    value_vars=list(map(str, range(1960, 2021))),
                    var_name='Year', value_name='Value')

# Long -> wide again, this time with one column per indicator and one row per
# (country, year). 'first' keeps the first value found for each cell.
df_pivoted = pd.pivot_table(df_melted,
                            values='Value',
                            index=['Country Code', 'Year'],
                            columns='Indicator Code',
                            aggfunc='first').reset_index()

# Normalise the column labels into a plain list of labels. Tuple labels (as a
# MultiIndex would produce) are joined with spaces; any other label is kept.
# NOTE(review): with a single `values` column the labels are presumably not
# tuples here, in which case this only resets the column index -- confirm.
df_pivoted.columns = df_pivoted.columns.to_flat_index()
df_pivoted.columns = [
    ' '.join(label).strip() if isinstance(label, tuple) else label
    for label in df_pivoted.columns
]

# 'Year' holds the former column labels, which are strings, hence '2011'.
df_2011 = df_pivoted.loc[df_pivoted['Year'] == '2011']

In [39]:
# Every alpha-3 code that pycountry lists for a country.
alpha_3_codes = set(entry.alpha_3 for entry in pycountry.countries)

# Keep only the rows whose 'Country Code' is one of those codes.
# NOTE(review): presumably this removes regional / income-group aggregates
# that use non-country codes -- confirm against the data.
df_2011 = df_2011.loc[df_2011['Country Code'].isin(alpha_3_codes)]

In [40]:
# Three-pass sparsity filter; `thresh` is the minimum number of non-missing
# values a row / column needs in order to be kept.
# 1) rows with at least 75 non-missing values
df_cleaned = df_2011.dropna(axis=0, thresh=75)
# 2) columns with at least 95 non-missing values among the remaining rows
df_cleanedd = df_cleaned.dropna(axis=1, thresh=95)
# 3) rows with at least 17 non-missing values among the remaining columns
df_country = df_cleanedd.dropna(axis=0, thresh=17)

In [42]:
# Everything after the first two columns is handed to the imputer; the first
# two are expected to be the 'Country Code' / 'Year' identifiers.
df_impute = df_country.iloc[:, 2:].copy()

# Iterative imputation with a linear-regression estimator; the seed is fixed
# for repeatable runs.
iterative_imputer = IterativeImputer(
    estimator=LinearRegression(),
    max_iter=10000,
    random_state=69,
)

# Perform the imputation and restore the column labels on the result.
df_imputed = pd.DataFrame(
    data=iterative_imputer.fit_transform(df_impute),
    columns=df_impute.columns,
)

# Re-attach the two leading columns. Their index is reset so that it lines up
# with the fresh 0..n-1 index of df_imputed.
temp = df_country.iloc[:, :2].reset_index(drop=True)
df_final = pd.concat([temp, df_imputed], axis='columns')


# Feature matrix: the imputed columns only.
X = df_final.iloc[:, 2:]

In [2]:
def pca_country(X, df_final):
    """
    Project a dataset onto the leading principal components that together
    explain at least 99% of its variance.

    A full PCA is fitted first to read the cumulative explained-variance
    ratio; a second PCA restricted to the selected number of components is
    then fitted and used for the projection.

    Parameters
    ----------
    X : array_like
        A NumPy array or a pandas DataFrame where rows are samples and columns
        are features.
    df_final : DataFrame
        Passed through unchanged; this function does not read or modify it.

    Returns
    -------
    tuple
        A tuple containing three elements:
        - A DataFrame holding the data projected onto the selected
          components (default integer index and column labels).
        - `df_final`, exactly as it was passed in.
        - The principal axes (`components_`) of the fitted PCA.

    """
    # First pass: keep every component, only to obtain the variance profile.
    variance_probe = PCA().fit(X)
    cumulative_variance = np.cumsum(variance_probe.explained_variance_ratio_)

    # argmax over a boolean array returns the position of the first True, so
    # this is the smallest component count reaching the 99% mark. When no
    # entry qualifies, argmax returns 0 and a single component is kept.
    n_selected = np.argmax(cumulative_variance >= 0.99) + 1

    # Second pass: fit_transform fits the reduced model and projects X in one
    # step, so no separate fit() call is needed beforehand.
    pca = PCA(n_components=n_selected)
    country_pca = pd.DataFrame(pca.fit_transform(X))
    components = pca.components_

    return country_pca, df_final, components

In [47]:
# Run the PCA helper on the feature matrix X. pca_country returns df_final
# exactly as it receives it, so rebinding df_final here does not change it.
country_pca, df_final, components = pca_country(X, df_final)