In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Load the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train_LP.csv', 'test_LP.csv'))

FileNotFoundError: [Errno 2] No such file or directory: 'train_LP.csv'

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Summary statistics as reported by DataFrame.describe().
train.describe()

In [None]:
# Column dtypes and non-null counts.
train.info()

In [None]:
# (rows, columns) of the training frame as loaded.
train.shape

In [None]:
# Number of missing values in each column.
train.isnull().sum()

In [None]:
# Fill gaps in these numeric columns with each column's own mean.
# NOTE(review): if Credit_History is a 0/1 flag, a mean fill produces a
# fractional value; confirm whether the mode would be more appropriate.
for mean_filled_col in ['LoanAmount', 'Loan_Amount_Term', 'Credit_History']:
    column_mean = train[mean_filled_col].mean()
    train[mean_filled_col] = train[mean_filled_col].fillna(column_mean)

In [None]:
# Fill gaps in these categorical columns with each column's most frequent value.
for mode_filled_col in ['Gender', 'Married', 'Dependents', 'Self_Employed']:
    most_frequent = train[mode_filled_col].mode()[0]
    train[mode_filled_col] = train[mode_filled_col].fillna(most_frequent)

In [None]:
# Spot-check two rows after the imputation cells above.
train.head(2)

## Exploratory Data Analysis

In [None]:
# List the column names available in the training frame.
train.columns

In [None]:
# Column groups used by the plots, the outlier step and the encoding below.
# NOTE(review): 'Dependents' is imputed earlier but is not listed in either
# group; confirm that leaving it out is intentional.
cat_cols = [
    'Gender',
    'Married',
    'Education',
    'Self_Employed',
    'Property_Area',
]
num_cols = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
]
# Kept as a one-element list so it can be concatenated with the lists above.
target = ['Loan_Status']

### Univariate Analysis

In [None]:
# Draw one bar chart of class counts per categorical feature.
for feature in cat_cols:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(data=train, x=feature, ax=ax)
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.set_title(f'Frequency Distribution for individual class in {feature}')

In [None]:
# Class balance of the target. The column is passed by keyword (matching the
# categorical plots above) because newer seaborn releases treat the first
# positional argument as `data`, not `x`.
sns.countplot(x='Loan_Status', data=train)

In [None]:
# Histogram plus KDE for every numeric column; the legend entry shows the
# column's skewness. `sns.distplot` is deprecated, so this uses the histplot
# equivalent from seaborn's migration notes (density-normalised bars, KDE
# extended three bandwidths past the data).
for col in num_cols:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(
        train[col],
        kde=True,
        stat='density',
        kde_kws=dict(cut=3),
        color='m',
        label=train[col].skew(),
        ax=ax,
    )
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.set_title(f'Frequency Distribution for individual class in {col}')
    ax.legend()

### Bivariate Analysis

In [None]:
# Pairwise relationships between the numeric columns.
sns.pairplot(train[num_cols])

In [None]:
# For each categorical feature, draw one row of box plots: one panel per
# numeric column, split by the categorical feature's classes.
for cat_col in cat_cols:
    fig = plt.figure(figsize=(30, 5))
    for panel_no, num_col in enumerate(num_cols, start=1):
        ax = fig.add_subplot(1, len(num_cols), panel_no)
        sns.boxplot(data=train, x=cat_col, y=num_col, ax=ax)
        ax.set(
            xlabel=cat_col,
            ylabel=num_col,
            title=f'Distribution of {num_col} with respect to {cat_col}',
        )

In [None]:
# Correlation matrix of the numeric columns only. Selecting them explicitly
# keeps this cell working on pandas >= 2.0, where DataFrame.corr() no longer
# drops non-numeric columns silently and raises instead.
corr = train.select_dtypes(include='number').corr()
plt.figure(figsize=(14, 7))
sns.heatmap(corr, annot=True, cmap='coolwarm')

### Outlier Handling

In [None]:
def handle_outliers(df, var):
    """Report and locate IQR outliers in one column.

    A value is an outlier when it lies more than 1.5 * IQR below the first
    quartile or above the third quartile of ``df[var]``. Missing values are
    ignored when computing the quartiles and are never flagged themselves.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to inspect. It is not modified.
    var : str
        Name of a numeric column in ``df``.

    Returns
    -------
    list
        Index labels of the rows whose value in ``var`` is an outlier.

    Side effects
    ------------
    Prints the number of outliers found.
    """
    series = df[var]

    # Series.quantile skips NaN; np.percentile would return NaN for the
    # quartiles and silently report zero outliers.
    q25 = series.quantile(0.25)
    q75 = series.quantile(0.75)

    iqr = q75 - q25

    lower = q25 - (iqr * 1.5)
    upper = q75 + (iqr * 1.5)

    # One mask drives both the printed count and the returned labels.
    is_outlier = (series < lower) | (series > upper)

    print(f'Number of Outliers in {var} = {int(is_outlier.sum())}')

    return list(df[is_outlier].index)

In [None]:
# Work on a copy so the outlier removal below leaves `train` itself untouched.
train_data = train.copy()

In [None]:
# Shape of the original frame, for comparison with the copy.
train.shape

In [None]:
# Shape of the working copy before any rows are dropped.
train_data.shape

In [None]:
# Gather the outlier row labels of every numeric column, de-duplicated so a
# row flagged by several columns appears only once.
outliers = list(set(
    row_label
    for num_col in num_cols
    for row_label in handle_outliers(train_data, num_col)
))

In [None]:
# Remove the flagged rows. errors='ignore' keeps the cell idempotent:
# re-running it after the rows are gone is a no-op instead of a KeyError.
train_data = train_data.drop(index=outliers, errors='ignore')

In [None]:
# Shape of the working copy after the outlier rows were dropped.
train_data.shape

### Categorical Encoding

In [None]:
# Preview the categorical and numeric feature columns before encoding.
train_data[cat_cols + num_cols].head()

In [None]:
# Preview the one-hot encoding of the feature columns (first level of each
# categorical dropped). Only the head is shown, like the neighbouring preview
# cells, so the whole frame is not dumped into the notebook output.
pd.get_dummies(train_data[cat_cols + num_cols], columns=cat_cols, drop_first=True).head()

In [None]:
# One-hot encode the categorical features (dropping each first level) while
# carrying the numeric features and the target through unchanged.
model_columns = cat_cols + num_cols + target
train_data_dm = pd.get_dummies(
    train_data[model_columns],
    columns=cat_cols,
    drop_first=True,
)

In [None]:
# Preview the encoded frame.
train_data_dm.head()

### Data Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the encoded rows; the fixed seed keeps the split reproducible.
split_frames = train_test_split(train_data_dm, test_size=0.2, random_state=102)
train_data_train, test_data_test = split_frames

In [None]:
# Shape of the training portion of the split.
train_data_train.shape

In [None]:
# Shape of the held-out portion of the split.
test_data_test.shape

In [None]:
# Shape of the full encoded frame, for comparison with the two splits.
train_data_dm.shape

In [None]:
# Sanity check: together the two splits must account for every encoded row.
# The sizes are computed from the frames so the check cannot go stale.
assert len(train_data_train) + len(test_data_test) == len(train_data_dm)
len(train_data_train) + len(test_data_test)

In [None]:
# Separate features from the target for the training split.
x_train = train_data_train.drop(target, axis=1)
# `target` is a one-element list; indexing with the list would give an (n, 1)
# DataFrame, which scikit-learn's label utilities flag as a column-vector y.
# Select the single column by name to get a 1-D Series instead.
y_train = train_data_train[target[0]]

In [None]:
# Separate features from the target for the held-out split.
x_test = test_data_test.drop(target, axis=1)
# `target` is a one-element list; indexing with the list would give an (n, 1)
# DataFrame, which scikit-learn's label utilities flag as a column-vector y.
# Select the single column by name to get a 1-D Series instead.
y_test = test_data_test[target[0]]

In [None]:
# Shape of the training feature matrix.
x_train.shape

In [None]:
# Shape of the training target.
y_train.shape

In [None]:
# Shape of the held-out feature matrix.
x_test.shape

In [None]:
# Shape of the held-out target.
y_test.shape

### Categorical encoding - Target (Label) Variable

In [None]:
# Label encoding
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encoder used for the target labels in the cells below.
le = LabelEncoder()

In [None]:
# Learn the class labels from the training split's target column. Reading it
# from the split frame (not from `y_train`, which a later cell overwrites with
# encoded integers) keeps this cell correct when it is re-run, and passes a
# 1-D Series rather than a single-column DataFrame.
le.fit(train_data_train[target[0]])

In [None]:
# Labels learned by the encoder.
le.classes_

In [None]:
# Encode the training labels. The input is read from the split frame rather
# than from `y_train` itself, so re-running this cell does not feed
# already-encoded integers back into the encoder.
y_train = le.transform(train_data_train[target[0]])

In [None]:
# Encode the held-out labels with the encoder fitted on the training labels.
# The input is read from the split frame rather than from `y_test` itself, so
# re-running this cell does not feed already-encoded integers back in.
y_test = le.transform(test_data_test[target[0]])

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit a default-parameter logistic regression on the training split, then
# predict labels for the held-out rows. `fit` returns the estimator itself.
lr_model = LogisticRegression().fit(x_train, y_train)
y_pred = lr_model.predict(x_test)

In [None]:
# Encoded true labels of the held-out rows.
y_test

In [None]:
# Labels predicted by the model for the held-out rows.
y_pred

In [None]:
# Number of true labels, to compare with the number of predictions.
len(y_test)

In [None]:
# Number of predictions, to compare with the number of true labels.
len(y_pred)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Confusion matrix of true labels (first argument) against predictions.
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Display the confusion matrix.
cm