In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Show up to 500 columns when a wide dataframe is displayed.
pd.set_option("display.max_columns", 500)

In [2]:
# Optional, Google Colab only: uncomment the lines below to mount Google Drive and read the dataset from there.
#from google.colab import drive # We mount a Google Drive folder where the dataset is.
#drive.mount('/content/drive', force_remount=True)

#path = "/content/drive" + "/MyDrive" + "/Cursos/Coderhouse/Data Science/"
# Local run: base folder of the dataset (presumably prepended to the data file names -- confirm where it is used).
path = "./"

In [3]:
from pandas.api.types import is_numeric_dtype
from scipy.stats import ttest_ind
import sidetable

In [4]:
def univariate_info(new_df):
  '''Build a one-row-per-column summary of a dataframe.

  For every column of new_df the result holds:
    - Count, Type, Missing, Unique, Numeric: non-null count, dtype, number of nulls, number of distinct values and
      whether the dtype is numeric;
    - top, mean, std, min, 25%, 50%, 75%, max: taken from new_df.describe(include='all');
    - skew, kurtosis: computed over the numeric columns only.

  Cells that do not apply to a column (for instance the mean of a text column) are filled with the string '0'.

  new_df is a dataframe

  Returns a dataframe indexed by the column names of new_df. A dataframe without columns gives an empty summary.'''
  info_cols = ['Count', 'Type', 'Missing', 'Unique', 'Numeric']
  describe_cols = ['top', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
  stats_cols = ['skew', 'kurtosis']

  # describe() cannot summarise a dataframe without columns, so that case is answered directly.
  if new_df.shape[1] == 0:
    return pd.DataFrame(columns=info_cols + describe_cols + stats_cols)

  df_info = pd.DataFrame(columns=info_cols)
  for col in new_df:
    data_series = new_df[col]
    df_info.loc[col] = [data_series.count(), data_series.dtype, data_series.isnull().sum(), data_series.nunique(), is_numeric_dtype(data_series)]

  # The frame-wide statistics do not depend on the loop variable, so they are computed once, after the loop.
  # reindex (instead of [[...]]) keeps this working when describe() leaves a statistic out: an all-numeric frame has
  # no 'top' and an all-text frame has no 'mean', 'std' or quantiles. The missing columns come back as NaN.
  df_describe = new_df.describe(include='all').T.reindex(columns=describe_cols)
  df_stats = pd.DataFrame([new_df.skew(numeric_only=True), new_df.kurtosis(numeric_only=True)], index=stats_cols).T
  return pd.concat([df_info, pd.concat([df_describe, df_stats], axis=1)], axis=1).fillna('0')

def corrFilter(x: pd.DataFrame, thres: float):
    '''List the pairs of variables whose Spearman correlation is at least thres in absolute value.

    x is a dataframe

    thres is the minimum absolute correlation a pair needs to be reported

    Returns a dataframe with the columns Variable_1, Variable_2 and corr_value, one row per unordered pair of
    distinct variables, sorted by corr_value in descending order.'''
    #generate corr
    xCorr = x.corr('spearman')
    #filter corr by thres
    strong = (xCorr >= thres) | (xCorr <= -thres)
    # Keep only the cells strictly below the diagonal. This drops the self-correlations and the mirrored copy of each
    # pair by position, so pairs that are perfectly correlated (1.0) or that happen to share the same coefficient
    # are no longer lost, as they would be when filtering on the value itself.
    below_diag = np.tril(np.ones(xCorr.shape, dtype=bool), k=-1)
    xFiltered = xCorr.where(strong & below_diag)
    #change dataframe format
    xFlattened = xFiltered.unstack().dropna().reset_index()
    #rename columns
    xFlattened.columns = ['Variable_1', 'Variable_2', 'corr_value']
    return xFlattened.sort_values('corr_value', ascending=False).reset_index(drop=True)

In [5]:
def strat_sample(df, target, sample_size, seed):
    '''We make a stratified sample by a binary target. The target values are 0 and 1, for simplicity.

        df is a dataframe

        target is the name of the target variable

        sample_size is the total number of rows wanted in the sample

        seed is the random seed to generate pseudo random numbers.

        Returns a dataframe with the rows sampled from class 0 followed by the rows sampled from class 1.'''

    rates = df[target].value_counts(normalize=True)
    # value_counts orders its result by frequency, so each class has to be looked up by label, never by position.
    # A class that does not appear in the data simply contributes no rows.
    rate_0 = rates.loc[0] if 0 in rates.index else 0.0
    rate_1 = rates.loc[1] if 1 in rates.index else 0.0

    # Class 0 gets its rounded share and class 1 gets what is left of the rounded joint share. Rounding the two
    # shares independently can miss the requested total by one row when both end in .5.
    size_0 = int(round(rate_0 * sample_size))
    size_1 = int(round((rate_0 + rate_1) * sample_size)) - size_0

    part_0 = df[df[target] == 0].sample(size_0, random_state=seed) # We take a sample of size_i for each target value.
    part_1 = df[df[target] == 1].sample(size_1, random_state=seed)
    sample = pd.concat([part_0, part_1])

    return sample

In [6]:
def describe_values(df, category_col, column_to_analyze):
  '''It does the same as df.groupby(category_col)[column_to_analyze].describe() but with more information: missing
  values, number of unique values, mode, skew and kurtosis are added.

  Rows whose category is null are left out, as groupby does by default. A category whose values are all null gets
  NaN as its mode instead of raising an error.

  df is a dataframe

  category_col is the name of the column that defines the groups

  column_to_analyze is the name of the numeric column that is summarised inside each group

  Returns a dataframe with one row per category, in order of first appearance.'''

  #create a dataframe with specific columns
  df_info = pd.DataFrame(columns=['count', 'missing', 'unique_values', 'mean', 'std', 'mode', 'min', '25%', '50%', '75%', 'max', 'skew', 'kurtosis'])
  df_info.index.name = category_col # The name in the index appears in the name of the first column

  #loop of all the non-null values that the category has (a null never compares equal to itself, so it selects no rows)
  for val in df[category_col].dropna().unique():
    # get info from column
    data_series = df[df[category_col] == val][column_to_analyze]
    # mode() is empty when every value of the group is null
    modes = data_series.mode()
    mode_value = modes.iloc[0] if not modes.empty else float('nan')
    # fill dataframe with initial columns
    df_info.loc[val] = [data_series.count(), data_series.isnull().sum(), data_series.nunique(), data_series.mean(), data_series.std(), mode_value, data_series.min(), data_series.quantile(.25), data_series.quantile(.5), data_series.quantile(.75), data_series.max(), data_series.skew(), data_series.kurtosis()]

  return df_info

In [8]:
def partition_log(a,b,n):
    '''Given an integer interval [a,b](={a, a+1, ..., b}) returns a partition {a=a_0, a_1, a_2, ..., a_n=b}
    "logarithmically equispaced": the ratio (a_{i+1}-a_0+1)/(a_{i}-a_0+1) is constant, that is, the shifted points
    a_i-a+1 form a geometric progression from 1 to b-a+1. (To understand why 1 is added, think about the interval [0,1]
    and n=1. You have to transform the interval to [1,2].)

    a, b are the ends of the interval, with b >= a

    n is the number of subintervals; the result has n+1 points (it is empty when n+1 <= 0)

    Returns a numpy array of floats. The points are not rounded to integers; the first one is exactly a and, when
    n >= 1, the last one is exactly b.

    Raises ValueError when b-a+1 <= 0, because the logarithmic spacing is not defined for an empty interval.'''

    width = b - a + 1
    if width <= 0:
        raise ValueError("partition_log needs b - a + 1 > 0, got a=%r and b=%r" % (a, b))

    # geomspace builds the whole progression in one call and pins both end points exactly, which repeated
    # exp/log evaluations do not guarantee.
    return np.geomspace(1.0, float(width), num=max(n + 1, 0)) - 1 + a

In [9]:
#!pip install mlxtend 
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [10]:
# Table styles that render the caption of a styled dataframe in bold at the normal font size.
styles = [
    {
        "selector": "caption",
        "props": [
            ("font-size", "100%"),
            ("font-weight", "bold"),
            # Another option: also add ("color", "white"), ("background-color", "grey")
        ],
    }
]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer 
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer # Code to use feature cat and numeric together
from sklearn.compose import make_column_selector # Selector in pipeline by dtypes

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import tree
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.metrics import balanced_accuracy_score as bas
import time
from xgboost import XGBClassifier
from sklearn.tree import export_graphviz
from sklearn.model_selection import RandomizedSearchCV
from skopt import BayesSearchCV
from scipy.stats import randint