# Initial Loading

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns                       
import matplotlib.pyplot as plt
%matplotlib inline     
sns.set(color_codes=True)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df_train = pd.read_csv('/content/drive/MyDrive/Homesite Quote Conversion/train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/Homesite Quote Conversion/test.csv')

# Looking over Dataset

In [None]:
df_train.head()
head_df = df_train.head()

fig, ax = plt.subplots(figsize=(12, 4))  # Adjust the size as needed
ax.axis('off')

table_data = [head_df.columns.values] + head_df.values.tolist()
table = ax.table(cellText=table_data, cellLoc='center', loc='center')

# Modify table appearance
table.auto_set_font_size(False)
table.set_fontsize(12)
table.scale(1, 1.5)

plt.savefig('head_dataframe.png', bbox_inches='tight')
plt.show()

In [None]:
df_train.shape

(260753, 299)

In [None]:
df_test.shape

(173836, 298)

In [None]:
df_train.columns

Index(['QuoteNumber', 'Original_Quote_Date', 'QuoteConversion_Flag', 'Field6',
       'Field7', 'Field8', 'Field9', 'Field10', 'Field11', 'Field12',
       ...
       'GeographicField59A', 'GeographicField59B', 'GeographicField60A',
       'GeographicField60B', 'GeographicField61A', 'GeographicField61B',
       'GeographicField62A', 'GeographicField62B', 'GeographicField63',
       'GeographicField64'],
      dtype='object', length=299)

In [None]:
df_train.dtypes

QuoteNumber              int64
Original_Quote_Date     object
QuoteConversion_Flag     int64
Field6                  object
Field7                   int64
                         ...  
GeographicField61B       int64
GeographicField62A       int64
GeographicField62B       int64
GeographicField63       object
GeographicField64       object
Length: 299, dtype: object

In [None]:
df_train.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260753 entries, 0 to 260752
Data columns (total 299 columns):
 #    Column                Dtype  
---   ------                -----  
 0    QuoteNumber           int64  
 1    Original_Quote_Date   object 
 2    QuoteConversion_Flag  int64  
 3    Field6                object 
 4    Field7                int64  
 5    Field8                float64
 6    Field9                float64
 7    Field10               object 
 8    Field11               float64
 9    Field12               object 
 10   CoverageField1A       int64  
 11   CoverageField1B       int64  
 12   CoverageField2A       int64  
 13   CoverageField2B       int64  
 14   CoverageField3A       int64  
 15   CoverageField3B       int64  
 16   CoverageField4A       int64  
 17   CoverageField4B       int64  
 18   CoverageField5A       int64  
 19   CoverageField5B       int64  
 20   CoverageField6A       int64  
 21   CoverageField6B       int64  
 22   CoverageField8    

In [None]:
for col in df_train:
    print(type(col))

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class

# Cleaning Train

In [None]:
df_train.drop('QuoteNumber', axis = 1, inplace = True)

In [None]:
np.where(pd.isnull(df_train)) 

(array([     1,      1,      2, ..., 260751, 260752, 260752]),
 array([124, 160, 124, ..., 160, 124, 160]))

In [None]:
df_train.isnull().sum()

Original_Quote_Date     0
QuoteConversion_Flag    0
Field6                  0
Field7                  0
Field8                  0
                       ..
GeographicField61B      0
GeographicField62A      0
GeographicField62B      0
GeographicField63       0
GeographicField64       0
Length: 298, dtype: int64

In [None]:
count = 0
for value in df_train.isnull().sum():
    if value != 0:
        print("Value is: {:>6} , ".format(value) + "Index is: {}".format(count))
    count = count + 1

Value is:    113 , Index is: 48
Value is: 124208 , Index is: 124
Value is:     81 , Index is: 129
Value is:     63 , Index is: 130
Value is: 200685 , Index is: 160
Value is:     70 , Index is: 163
Value is:     70 , Index is: 165
Value is:    113 , Index is: 167
Value is:   1220 , Index is: 169


In [None]:
count = 0
for col in df_train:
    print("Column is: {:>20} , ".format(col) + "Index is: {}".format(count))
    count = count + 1

Column is:  Original_Quote_Date , Index is: 0
Column is: QuoteConversion_Flag , Index is: 1
Column is:               Field6 , Index is: 2
Column is:               Field7 , Index is: 3
Column is:               Field8 , Index is: 4
Column is:               Field9 , Index is: 5
Column is:              Field10 , Index is: 6
Column is:              Field11 , Index is: 7
Column is:              Field12 , Index is: 8
Column is:      CoverageField1A , Index is: 9
Column is:      CoverageField1B , Index is: 10
Column is:      CoverageField2A , Index is: 11
Column is:      CoverageField2B , Index is: 12
Column is:      CoverageField3A , Index is: 13
Column is:      CoverageField3B , Index is: 14
Column is:      CoverageField4A , Index is: 15
Column is:      CoverageField4B , Index is: 16
Column is:      CoverageField5A , Index is: 17
Column is:      CoverageField5B , Index is: 18
Column is:      CoverageField6A , Index is: 19
Column is:      CoverageField6B , Index is: 20
Column is:       Covera

In [None]:
df_train = df_train.dropna()
df_train.isnull().sum()

Original_Quote_Date     0
QuoteConversion_Flag    0
Field6                  0
Field7                  0
Field8                  0
                       ..
GeographicField61B      0
GeographicField62A      0
GeographicField62B      0
GeographicField63       0
GeographicField64       0
Length: 298, dtype: int64

In [None]:
#Data Cleaned, no missing values in dataset

count = 0
for value in df_train.isnull().sum():
    if value != 0:
        print("Value is: {:>6} , ".format(value) + "Index is: {}".format(count))
    count = count + 1

# Cleaning Test

In [None]:
df_test.drop('QuoteNumber', axis = 1, inplace = True)

In [None]:
np.where(pd.isnull(df_test)) 

(array([     0,      0,      1, ..., 173834, 173835, 173835]),
 array([123, 159, 123, ..., 159, 123, 159]))

In [None]:
df_test.isnull().sum()

Original_Quote_Date    0
Field6                 0
Field7                 0
Field8                 0
Field9                 0
                      ..
GeographicField61B     0
GeographicField62A     0
GeographicField62B     0
GeographicField63      0
GeographicField64      0
Length: 297, dtype: int64

In [None]:
count = 0
for value in df_test.isnull().sum():
    if value != 0:
        print("Value is: {:>6} , ".format(value) + "Index is: {}".format(count))
    count = count + 1

Value is:     69 , Index is: 47
Value is:  82812 , Index is: 123
Value is:     69 , Index is: 128
Value is:     52 , Index is: 129
Value is:      1 , Index is: 130
Value is: 133945 , Index is: 159
Value is:      1 , Index is: 160
Value is:     41 , Index is: 162
Value is:     41 , Index is: 164
Value is:     67 , Index is: 166
Value is:    846 , Index is: 168


In [None]:
count = 0
for col in df_test:
    print("Column is: {:>20} , ".format(col) + "Index is: {}".format(count))
    count = count + 1

Column is:  Original_Quote_Date , Index is: 0
Column is:               Field6 , Index is: 1
Column is:               Field7 , Index is: 2
Column is:               Field8 , Index is: 3
Column is:               Field9 , Index is: 4
Column is:              Field10 , Index is: 5
Column is:              Field11 , Index is: 6
Column is:              Field12 , Index is: 7
Column is:      CoverageField1A , Index is: 8
Column is:      CoverageField1B , Index is: 9
Column is:      CoverageField2A , Index is: 10
Column is:      CoverageField2B , Index is: 11
Column is:      CoverageField3A , Index is: 12
Column is:      CoverageField3B , Index is: 13
Column is:      CoverageField4A , Index is: 14
Column is:      CoverageField4B , Index is: 15
Column is:      CoverageField5A , Index is: 16
Column is:      CoverageField5B , Index is: 17
Column is:      CoverageField6A , Index is: 18
Column is:      CoverageField6B , Index is: 19
Column is:       CoverageField8 , Index is: 20
Column is:       Covera

In [None]:
df_test = df_test.dropna()
df_test.isnull().sum()

Original_Quote_Date    0
Field6                 0
Field7                 0
Field8                 0
Field9                 0
                      ..
GeographicField61B     0
GeographicField62A     0
GeographicField62B     0
GeographicField63      0
GeographicField64      0
Length: 297, dtype: int64

In [None]:
#Data Cleaned, no missing values in dataset

#Checks to see if any missing values ; If nothing prints then there is no NaN values.
count = 0
for value in df_test.isnull().sum():
    if value != 0:
        print("Value is: {:>6} , ".format(value) + "Index is: {}".format(count))
    count = count + 1

In [None]:
# Uploading Dataframe as CSV to use 

df_train.to_csv('hsq_df_train.csv', index = False)
!cp hsq_df_train.csv "drive/My Drive/"

df_test.to_csv('hsq_df_test.csv', index = False)
!cp hsq_df_test.csv "drive/My Drive/"