In [1]:
import numpy as np
import pandas as pd
import scipy.stats
from scipy.stats import chi2

In [2]:
# Suppress every warning for the rest of the session.
# NOTE(review): the 'ignore' filter is global, so deprecation and dtype
# warnings raised by later cells will not be shown — confirm this is intended.
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the three input tables; all paths are relative to the working directory.
#   df  - 'train.csv'; later cells read columns such as 'Target', 'idhogar'
#         and 'parentesco1' from it.
#   df1 - 'test.csv'.
#   df2 - 'housing.xlsx'; later cells read its 'ocean_proximity' column.
# NOTE(review): df and df2 appear to be two unrelated datasets — confirm.
df=pd.read_csv('train.csv')
df1=pd.read_csv('test.csv')
df2 = pd.read_excel("housing.xlsx")

In [4]:
def print_various_data_info(df):
    """Print a structural summary of ``df`` to stdout.

    Reports, in order: number of columns, number of rows, total number of
    cells, how many columns there are of each dtype, the shape, the per-column
    non-null counts, and finally one ``<column> <null count>`` line per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    None
    """
    n_cols = len(df.columns)
    n_rows = len(df.index)

    print(f"No of columns={n_cols}")
    print(f"No of rows={n_rows}")
    print(f"No of entries={n_cols * n_rows}")
    print(f"Data Type=\n{df.dtypes.value_counts()}")
    print(f"Shape of Dataframe={df.shape}")
    print(f"Not Null values=\n{df.count()}")

    # One line per column: its label followed by the number of missing cells.
    for column_name in df.columns:
        missing = df[column_name].isna().sum()
        print(column_name, missing)

In [5]:
def first_ten_and_last_ten(df):
    """Print the first ten rows of ``df`` followed by its last ten rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to preview.

    Returns
    -------
    None
    """
    for preview in (df.head(10), df.tail(10)):
        print(preview)

In [6]:
def print_numerical_columns(df):
    """Print the list of column labels whose dtype is ``int64`` or ``float64``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
    """
    numeric_part = df.select_dtypes(include=['int64', 'float64'])
    numeric_names = numeric_part.columns.tolist()
    print(numeric_names)

In [7]:
def get_int_float_dtype_null_column_list(df):
    """Return the null count of every ``int64``/``float64`` column that has nulls.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Indexed by column label, holding the number of null cells; columns
        with no nulls are left out.
    """
    numeric_part = df.select_dtypes(['int64', 'float64'])
    missing_per_column = numeric_part.isnull().sum()
    has_missing = missing_per_column > 0
    return missing_per_column[has_missing]

In [8]:
def fill_int_float_dtype_null_cells_data_with_mean(df):
    """Return a copy of ``df`` with numeric nulls replaced by the column mean.

    The previous version called ``df.fillna(df.mean())`` and threw the result
    away, so it had no effect at all. It also averaged over every column,
    which raises on non-numeric data in recent pandas releases.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame in which each ``int64``/``float64`` column has its nulls
        replaced by that column's mean. Other columns are left as they are.
    """
    # Restrict the means to the numeric dtypes the sibling helpers use, so
    # object/string columns neither break mean() nor get filled.
    numeric_means = df.select_dtypes(include=['int64', 'float64']).mean()
    # fillna with a Series fills column-by-column, matching on column label.
    # A new frame is returned rather than using inplace=True, so re-running
    # the calling cell stays idempotent.
    return df.fillna(numeric_means)

In [9]:
def get_object_dtype_column_list(df):
    """Return the labels of all ``object``-dtype columns of ``df`` as a list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Column labels, in the frame's column order.
    """
    return df.select_dtypes(include=['object']).columns.tolist()

In [10]:
def get_category_column_list(df, max_count=10):
    """Return the one-hot ``ocean_proximity`` columns that are rarely set.

    ``df.ocean_proximity`` is one-hot encoded, the ``'NEAR OCEAN'`` indicator
    and the original ``ocean_proximity`` column are dropped, and every
    remaining ``uint8`` column whose number of 1s is at most ``max_count`` is
    reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``ocean_proximity`` column that contains the value
        ``'NEAR OCEAN'`` (otherwise the drop raises ``KeyError``).
    max_count : int, default 10
        Largest number of 1s a column may contain and still be listed.

    Returns
    -------
    list
        Labels of the qualifying columns, in column order.
    """
    # Pin the dummy dtype: pandas >= 2.0 returns bool indicators by default,
    # and the uint8 filter below would then match nothing and always give [].
    dummies = pd.get_dummies(df.ocean_proximity, dtype='uint8')
    merged = pd.concat([df, dummies], axis=1)
    final = merged.drop(['ocean_proximity', 'NEAR OCEAN'], axis=1)
    indicator_columns = final.select_dtypes(include=['uint8']).columns.tolist()
    # Count the 1s with a comparison instead of value_counts()[1], which
    # raises KeyError for a column that contains no 1 at all.
    return [
        column
        for column in indicator_columns
        if (final[column] == 1).sum() <= max_count
    ]

In [11]:
def apply_category_column_encoding(df,get_category_column_list):
    """One-hot encode ``df.ocean_proximity`` and fold rare categories together.

    The frame is one-hot encoded on ``ocean_proximity``; the original column
    and the ``'NEAR OCEAN'`` indicator are dropped. The indicator columns named
    by ``get_category_column_list(df)`` are then replaced by a single trailing
    ``Addtional_Col`` column that is 1 wherever any of them is 1.

    The previous version read the global ``df2`` instead of ``df``, and joined
    the returned labels into one string, so it only worked when exactly one
    label came back.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``ocean_proximity`` column that contains the value
        ``'NEAR OCEAN'`` (otherwise the drop raises ``KeyError``).
    get_category_column_list : callable
        Called as ``get_category_column_list(df)``; must return an iterable of
        indicator-column labels to fold together.

    Returns
    -------
    pandas.DataFrame
        The encoded frame. When the callable returns no labels, the encoded
        frame is returned without an ``Addtional_Col`` column.
    """
    # uint8 indicators regardless of pandas version (>= 2.0 defaults to bool).
    dummies = pd.get_dummies(df.ocean_proximity, dtype='uint8')
    merged = pd.concat([df, dummies], axis=1)
    final = merged.drop(['ocean_proximity', 'NEAR OCEAN'], axis=1)

    rare_columns = list(get_category_column_list(df))
    if not rare_columns:
        # Nothing to fold together; return the plain encoding.
        return final

    # Row-wise max of 0/1 indicators is 1 where any rare category is set.
    # With a single rare column this is just that column's values.
    combined = final[rare_columns].max(axis=1).values
    # The output column keeps its original spelling because callers may
    # already select it by that name.
    return final.drop(rare_columns, axis=1).assign(Addtional_Col=combined)

In [12]:
def ocean_poximity(df):
    """Count the non-null values of every column per ``ocean_proximity`` value.

    The previous version ignored its argument and always grouped the global
    ``df2``; it now groups the frame that is passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``ocean_proximity`` column.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``ocean_proximity``; each remaining column holds the number
        of non-null values in that group.
    """
    return df.groupby(['ocean_proximity']).count()

In [13]:
# Cleaning the dataset: in 'dependency', 'edjefe' and 'edjefa', replace the
# strings 'yes'/'no' with 1/0 and then cast the whole column to float64.
# NOTE(review): the float64 cast assumes every other value in these columns
# is already numeric or a numeric string — confirm against the raw data.
mapping={'yes':1,'no':0}

df['dependency'] =df['dependency'].replace(mapping).astype(np.float64)
df['edjefe'] =df['edjefe'].replace(mapping).astype(np.float64)
df['edjefa'] =df['edjefa'].replace(mapping).astype(np.float64)

In [14]:
# Independent copy of the rows where parentesco1 == 1 (presumably the
# head-of-household rows — TODO confirm). The copy is taken before the
# fillna(0) update applied to df in the following cell, so `heads` keeps
# whatever NaNs those columns had at this point.
heads = df.loc[df['parentesco1'] == 1].copy()

In [15]:
# Replace NaN with 0 in these five columns and write the result back into df
# in place (DataFrame.update modifies df and returns None).
df.update(df[['v2a1','v18q1','rez_esc','meaneduc','SQBmeaned']].fillna(0))

In [16]:
def biased(df):
    """Print whether ``Target`` class 1 is rarer than every other class.

    Counts the rows of ``df`` per ``Target`` value (1 to 4) and prints
    ``"Dataset is Biased"`` when class 1 has strictly fewer rows than each of
    classes 2, 3 and 4, otherwise ``"Dataset is Not Biased"``.

    Two defects of the previous version are fixed: it read the global
    ``heads`` instead of its argument, and it combined the class counts with
    the bitwise ``&`` operator, which compares against a bit-masked number
    rather than against each class.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``Target`` column. Pass the head-of-household subset to
        reproduce the analysis the old body ran on the global.

    Returns
    -------
    None
    """
    # reindex guarantees an entry for every class, so an absent class counts
    # as 0 instead of raising KeyError.
    target_counts = df['Target'].value_counts().reindex([1, 2, 3, 4], fill_value=0)
    smallest_other = min(target_counts.loc[2], target_counts.loc[3], target_counts.loc[4])
    if target_counts.loc[1] < smallest_other:
        print("Dataset is Biased")
    else:
        print("Dataset is Not Biased")
    

In [17]:
def check_poverty(df):
    """Print how many households have members with differing ``Target`` values.

    Rows are grouped by ``idhogar``; a household counts as inconsistent when
    its ``Target`` column holds more or fewer than exactly one distinct value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have ``idhogar`` and ``Target`` columns.

    Returns
    -------
    None
    """
    def _has_single_level(levels):
        return levels.nunique() == 1

    per_household = df.groupby('idhogar')['Target'].apply(_has_single_level)
    mismatched = per_household[per_household != True]
    message = 'There are {} households where the family members of the house do not have the same poverty level.'
    print(message.format(len(mismatched)))

In [18]:
def check_head(df):
    """Print how many households have no row flagged with ``parentesco1``.

    ``parentesco1`` is summed per ``idhogar``; households whose sum is 0 are
    counted.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have ``idhogar`` and ``parentesco1`` columns.

    Returns
    -------
    None
    """
    head_counts = df.groupby('idhogar')['parentesco1'].sum()
    headless_ids = head_counts[head_counts == 0].index
    headless_rows = df.loc[df['idhogar'].isin(headless_ids), :]
    n_headless = headless_rows['idhogar'].nunique()
    print('There are {} households without a head.'.format(n_headless))

In [19]:
def set_poverty(df):
    """Give every member of a household the ``Target`` of its head, in place.

    Households (``idhogar``) whose members do not share exactly one distinct
    ``Target`` value are located, and each member's ``Target`` is overwritten
    with the value found on the row where ``parentesco1 == 1``.

    The previous version read ``not_equal``, a name that exists only inside
    ``check_poverty``, so it raised ``NameError`` on a fresh kernel. It also
    called ``int()`` on a Series, which fails unless the household has exactly
    one head row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have ``idhogar``, ``parentesco1`` and ``Target`` columns. It is
        modified in place.

    Returns
    -------
    None
    """
    # Recompute the inconsistent households here rather than depending on a
    # variable left behind by another cell.
    labels_agree = df.groupby('idhogar')['Target'].apply(lambda x: x.nunique() == 1)
    not_equal = labels_agree[labels_agree != True]

    for household in not_equal.index:
        in_household = df['idhogar'] == household
        head_targets = df.loc[in_household & (df['parentesco1'] == 1.0), 'Target'].dropna()
        if head_targets.empty:
            # No head row (or the head has no Target): nothing trustworthy to
            # copy, so leave this household unchanged.
            continue
        # If several rows are flagged as head, the first one wins.
        df.loc[in_household, 'Target'] = int(head_targets.iloc[0])

In [20]:
def remove_null_at_target(df):
    """Return ``df`` without the rows whose ``Target`` is null.

    The previous version bound the filtered frame to a local name and returned
    nothing, so calling it had no effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``Target`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the rows with a non-null ``Target``.
    """
    return df.dropna(axis=0, subset=['Target'])

In [21]:
def merge(df,df1):
    """Stack ``df1`` below ``df`` and return the combined frame.

    Both frames keep their own index labels, so labels can repeat in the
    result.

    Parameters
    ----------
    df, df1 : pandas.DataFrame
        Frames to concatenate row-wise, in that order.

    Returns
    -------
    pandas.DataFrame
    """
    frames = [df, df1]
    return pd.concat(frames)