In [15]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Data Access

In [16]:
def data_load(data_dir: str = "project_data") -> list:
    """Load the project's CSV files into DataFrames.

    Parameters
    ----------
    data_dir : str or path-like, default "project_data"
        Directory that contains ``train_data.csv``, ``test_data.csv`` and
        ``train_labels.csv``. The default keeps the previously hard-coded
        location, so existing ``data_load()`` calls behave as before.

    Returns
    -------
    list
        ``[train_data, test_data, train_labels]`` as DataFrames, in that order.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when one of the files is missing.
    """

    base_dir = Path(data_dir)
    file_names = ("train_data.csv", "test_data.csv", "train_labels.csv")

    # header=None: the files carry no header row, so the first line is read as
    # data and the columns get integer labels (0, 1, 2, ...).
    return [pd.read_csv(base_dir / file_name, header=None) for file_name in file_names]

In [17]:
# Unpack the three DataFrames in the order data_load() returns them.
train_data, test_data, train_labels = data_load()

In [18]:
# Parallel lists: each DataFrame in `data` is paired by position with its
# display name in `names` (the EDA helpers below zip the two together).
data = [train_data, test_data, train_labels]
names = ["train_data", "test_data", "train_labels"]

# EDA

In [19]:
def data_info(dfs: list, cols: list) -> None:
    """Print a labelled ``.info()`` summary for every dataframe.

    ``dfs`` and ``cols`` are walked in parallel: each frame in ``dfs`` is
    announced with a banner carrying the matching entry of ``cols`` and is then
    summarised with ``DataFrame.info()``. Nothing is returned.
    """

    for frame, label in zip(dfs, cols):
        header = f"""
####################################
\ninfo for {label}\n
####################################
        """
        print(header)
        # .info() writes its report to stdout itself, so no print() is needed.
        frame.info()

In [20]:
# Finding: the feature matrices are very wide (10000 float columns for
# 3750 train / 1250 test rows).
# Candidate remedies: dimensionality reduction / feature selection
# (correlation heatmap, univariate selection, PCA - Principal Component
# Analysis, RFE - Recursive Feature Elimination).
data_info(data, names)


####################################

info for train_data

####################################
        
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3750 entries, 0 to 3749
Columns: 10000 entries, 0 to 9999
dtypes: float64(10000)
memory usage: 286.1 MB

####################################

info for test_data

####################################
        
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1250 entries, 0 to 1249
Columns: 10000 entries, 0 to 9999
dtypes: float64(10000)
memory usage: 95.4 MB

####################################

info for train_labels

####################################
        
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3750 entries, 0 to 3749
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   0       3750 non-null   int64
dtypes: int64(1)
memory usage: 29.4 KB


In [21]:
def data_describe(dfs:list, cols:list) -> None:
    """Print labelled ``.describe()`` statistics for every dataframe.

    Each frame in ``dfs`` is paired by position with a label from ``cols``;
    a banner with the label is printed first, followed by the text rendering
    of ``DataFrame.describe()``. Nothing is returned.
    """

    for frame, label in zip(dfs, cols):
        header = f"""
####################################
\ndescribe for {label}\n
####################################
        """
        print(header)
        summary = frame.describe()
        print(summary)

In [22]:
# Task type: (binary) classification.
# Candidate models: SGDClassifier, LogisticRegression, LinearSVC.
data_describe(data, names)


####################################

describe for train_data

####################################
        
              0            1              2              3             4     \
count  3750.000000  3750.000000    3750.000000    3750.000000   3750.000000   
mean     30.644135     9.080955     169.679419     811.083532   -186.008805   
std     286.172414   213.166659   43665.003613   57123.457110  21942.466227   
min    -989.450000  -859.922000 -163336.428000 -243844.632000 -77566.109000   
25%    -163.781750  -135.343500  -29284.119500  -38028.429000 -14426.799500   
50%      35.478500     2.680500    1181.698500     980.588000   -154.835000   
75%     215.780500   156.095750   29630.914500   40561.960250  14515.833000   
max    1224.394000   722.868000  177744.241000  215372.854000  86287.955000   

              5            6            7            8            9     ...  \
count  3750.000000  3750.000000  3750.000000  3750.000000  3750.000000  ...   
mean     25.717789  

In [23]:
def data_isnull(dfs:list, cols:list) -> None:
    """Print the per-column share of null values for every dataframe.

    Each frame in ``dfs`` is paired by position with a label from ``cols``.
    For every pair a banner is printed, followed by the percentage of null
    entries per column, sorted from highest to lowest and rounded to two
    decimals. Nothing is returned.
    """

    for frame, label in zip(dfs, cols):
        header = f"""
#############################################################
\ndescending percentage of null values in columns of {label}\n
#############################################################
        """
        print(header)
        # Sort the raw null counts first, then scale them to percentages.
        null_counts = frame.isnull().sum().sort_values(ascending=False)
        null_percent = null_counts / len(frame) * 100
        print(null_percent.round(2))

In [24]:
data_isnull(data, names)  # result: no null values in any of the three dataframes


#############################################################

descending percentage of null values in columns of train_data

#############################################################
        
0       0.0
6670    0.0
6663    0.0
6664    0.0
6665    0.0
       ... 
3333    0.0
3334    0.0
3335    0.0
3336    0.0
9999    0.0
Length: 10000, dtype: float64

#############################################################

descending percentage of null values in columns of test_data

#############################################################
        
0       0.0
6670    0.0
6663    0.0
6664    0.0
6665    0.0
       ... 
3333    0.0
3334    0.0
3335    0.0
3336    0.0
9999    0.0
Length: 10000, dtype: float64

#############################################################

descending percentage of null values in columns of train_labels

#############################################################
        
0    0.0
dtype: float64


In [25]:
# Single-element parallel lists so the label frame can be handed to the
# list-based data_count() helper.
label_data = [train_labels]
label_names = ["train_labels"]

In [26]:
def data_count(df_y:list, col:list) -> None:
    """Print the value counts of every label frame to judge class balance.

    ``df_y`` and ``col`` are walked in parallel: a banner with the name from
    ``col`` is printed, followed by the result of ``.value_counts()`` on the
    matching entry of ``df_y``. Nothing is returned.
    """

    for labels, label_name in zip(df_y, col):
        header = f"""
#############################################################
\nvalues counts of {label_name}\n
#############################################################
        """
        print(header)
        counts = labels.value_counts()
        print(counts)

In [27]:
# Finding: the labels are imbalanced (3375 samples of class 1 vs 375 of class -1).
# Candidate remedies: random undersampling, oversampling / SMOTE.
data_count(label_data, label_names)



#############################################################

values counts of train_labels

#############################################################
        
 1    3375
-1     375
dtype: int64
