### Problem statement
Building a logistic regression model to assign a lead score between 0 and 100 to each of the leads, which can be used by the company to target potential leads

In [610]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [611]:
data = pd.read_csv('/Users/mrbinit/Downloads/Leads.csv')

In [612]:
data.shape

(9240, 37)

### Understanding missing value

In [613]:
missing_value = data.isnull().sum()
print(missing_value)

Prospect ID                                         0
Lead Number                                         0
Lead Origin                                         0
Lead Source                                        36
Do Not Email                                        0
Do Not Call                                         0
Converted                                           0
TotalVisits                                       137
Total Time Spent on Website                         0
Page Views Per Visit                              137
Last Activity                                     103
Country                                          2461
Specialization                                   1438
How did you hear about X Education               2207
What is your current occupation                  2690
What matters most to you in choosing a course    2709
Search                                              0
Magazine                                            0
Newspaper Article           

### Changing columns name


In [614]:
data.columns

Index(['Prospect ID', 'Lead Number', 'Lead Origin', 'Lead Source',
       'Do Not Email', 'Do Not Call', 'Converted', 'TotalVisits',
       'Total Time Spent on Website', 'Page Views Per Visit', 'Last Activity',
       'Country', 'Specialization', 'How did you hear about X Education',
       'What is your current occupation',
       'What matters most to you in choosing a course', 'Search', 'Magazine',
       'Newspaper Article', 'X Education Forums', 'Newspaper',
       'Digital Advertisement', 'Through Recommendations',
       'Receive More Updates About Our Courses', 'Tags', 'Lead Quality',
       'Update me on Supply Chain Content', 'Get updates on DM Content',
       'Lead Profile', 'City', 'Asymmetrique Activity Index',
       'Asymmetrique Profile Index', 'Asymmetrique Activity Score',
       'Asymmetrique Profile Score',
       'I agree to pay the amount through cheque',
       'A free copy of Mastering The Interview', 'Last Notable Activity'],
      dtype='object')

In [615]:
# Dictionary to map old column names to new column names
column_mapping = {
    'Prospect ID': 'Prospect_ID',
    'Lead Number': 'Lead_Number',
    'Lead Origin': 'Lead_Origin',
    'Lead Source': 'Lead_Source',
    'Do Not Email': 'Do_Not_Email',
    'Do Not Call': 'Do_Not_Call',
    'Converted': 'Converted',
    'TotalVisits': 'Total_Visits',
    'Total Time Spent on Website': 'Total_Time_Spent_on_Website',
    'Page Views Per Visit': 'Page_Views_Per_Visit',
    'Last Activity': 'Last_Activity',
    'Country': 'Country',
    'Specialization': 'Specialization',
    'How did you hear about X Education': 'How_did_you_hear_about_X_Education',
    'What is your current occupation': 'Current_Occupation',
    'What matters most to you in choosing a course': 'Matters_Most_Choosing_Course',
    'Search': 'Search',
    'Magazine': 'Magazine',
    'Newspaper Article': 'Newspaper_Article',
    'X Education Forums': 'X_Education_Forums',
    'Newspaper': 'Newspaper',
    'Digital Advertisement': 'Digital_Advertisement',
    'Through Recommendations': 'Through_Recommendations',
    'Receive More Updates About Our Courses': 'Receive_More_Updates',
    'Tags': 'Tags',
    'Lead Quality': 'Lead_Quality',
    'Update me on Supply Chain Content': 'Update_on_Supply_Chain_Content',
    'Get updates on DM Content': 'Update_on_DM_Content',
    'Lead Profile': 'Lead_Profile',
    'City': 'City',
    'Asymmetrique Activity Index': 'Asymmetrique_Activity_Index',
    'Asymmetrique Profile Index': 'Asymmetrique_Profile_Index',
    'Asymmetrique Activity Score': 'Asymmetrique_Activity_Score',
    'Asymmetrique Profile Score': 'Asymmetrique_Profile_Score',
    'I agree to pay the amount through cheque': 'Agree_to_Pay_Cheque',
    'A free copy of Mastering The Interview': 'Free_Copy_Mastering_The_Interview',
    'Last Notable Activity': 'Last_Notable_Activity',
}

# Rename the columns using the mapping
data.rename(columns=column_mapping, inplace=True)

In [616]:
data.columns

Index(['Prospect_ID', 'Lead_Number', 'Lead_Origin', 'Lead_Source',
       'Do_Not_Email', 'Do_Not_Call', 'Converted', 'Total_Visits',
       'Total_Time_Spent_on_Website', 'Page_Views_Per_Visit', 'Last_Activity',
       'Country', 'Specialization', 'How_did_you_hear_about_X_Education',
       'Current_Occupation', 'Matters_Most_Choosing_Course', 'Search',
       'Magazine', 'Newspaper_Article', 'X_Education_Forums', 'Newspaper',
       'Digital_Advertisement', 'Through_Recommendations',
       'Receive_More_Updates', 'Tags', 'Lead_Quality',
       'Update_on_Supply_Chain_Content', 'Update_on_DM_Content',
       'Lead_Profile', 'City', 'Asymmetrique_Activity_Index',
       'Asymmetrique_Profile_Index', 'Asymmetrique_Activity_Score',
       'Asymmetrique_Profile_Score', 'Agree_to_Pay_Cheque',
       'Free_Copy_Mastering_The_Interview', 'Last_Notable_Activity'],
      dtype='object')

In [617]:
data.City.isnull().sum()

1420

In [618]:
data.head(10)

Unnamed: 0,Prospect_ID,Lead_Number,Lead_Origin,Lead_Source,Do_Not_Email,Do_Not_Call,Converted,Total_Visits,Total_Time_Spent_on_Website,Page_Views_Per_Visit,...,Update_on_DM_Content,Lead_Profile,City,Asymmetrique_Activity_Index,Asymmetrique_Profile_Index,Asymmetrique_Activity_Score,Asymmetrique_Profile_Score,Agree_to_Pay_Cheque,Free_Copy_Mastering_The_Interview,Last_Notable_Activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,API,Olark Chat,No,No,0,0.0,0,0.0,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,API,Organic Search,No,No,0,5.0,674,2.5,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Email Opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,Landing Page Submission,Direct Traffic,No,No,1,2.0,1532,2.0,...,No,Potential Lead,Mumbai,02.Medium,01.High,14.0,20.0,No,Yes,Email Opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,Landing Page Submission,Direct Traffic,No,No,0,1.0,305,1.0,...,No,Select,Mumbai,02.Medium,01.High,13.0,17.0,No,No,Modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,Landing Page Submission,Google,No,No,1,2.0,1428,1.0,...,No,Select,Mumbai,02.Medium,01.High,15.0,18.0,No,No,Modified
5,2058ef08-2858-443e-a01f-a9237db2f5ce,660680,API,Olark Chat,No,No,0,0.0,0,0.0,...,No,,,01.High,02.Medium,17.0,15.0,No,No,Modified
6,9fae7df4-169d-489b-afe4-0f3d752542ed,660673,Landing Page Submission,Google,No,No,1,2.0,1640,2.0,...,No,Potential Lead,Mumbai,02.Medium,01.High,14.0,20.0,No,No,Modified
7,20ef72a2-fb3b-45e0-924e-551c5fa59095,660664,API,Olark Chat,No,No,0,0.0,0,0.0,...,No,,,02.Medium,02.Medium,15.0,15.0,No,No,Modified
8,cfa0128c-a0da-4656-9d47-0aa4e67bf690,660624,Landing Page Submission,Direct Traffic,No,No,0,2.0,71,2.0,...,No,,Thane & Outskirts,02.Medium,02.Medium,14.0,14.0,No,Yes,Email Opened
9,af465dfc-7204-4130-9e05-33231863c4b5,660616,API,Google,No,No,0,4.0,58,4.0,...,No,,Mumbai,02.Medium,02.Medium,13.0,16.0,No,No,Email Opened


### finding columns with select


In [619]:
data.columns[data.isin(['Select']). any()]

Index(['Specialization', 'How_did_you_hear_about_X_Education', 'Lead_Profile',
       'City'],
      dtype='object')

### Replacing select with nan

In [620]:
# Replace selected values with null values
data = data.replace("Select", np.nan)


In [621]:
data.columns[data.isin(['Select']). any()]

Index([], dtype='object')

In [622]:
data.head(10)

Unnamed: 0,Prospect_ID,Lead_Number,Lead_Origin,Lead_Source,Do_Not_Email,Do_Not_Call,Converted,Total_Visits,Total_Time_Spent_on_Website,Page_Views_Per_Visit,...,Update_on_DM_Content,Lead_Profile,City,Asymmetrique_Activity_Index,Asymmetrique_Profile_Index,Asymmetrique_Activity_Score,Asymmetrique_Profile_Score,Agree_to_Pay_Cheque,Free_Copy_Mastering_The_Interview,Last_Notable_Activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,API,Olark Chat,No,No,0,0.0,0,0.0,...,No,,,02.Medium,02.Medium,15.0,15.0,No,No,Modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,API,Organic Search,No,No,0,5.0,674,2.5,...,No,,,02.Medium,02.Medium,15.0,15.0,No,No,Email Opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,Landing Page Submission,Direct Traffic,No,No,1,2.0,1532,2.0,...,No,Potential Lead,Mumbai,02.Medium,01.High,14.0,20.0,No,Yes,Email Opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,Landing Page Submission,Direct Traffic,No,No,0,1.0,305,1.0,...,No,,Mumbai,02.Medium,01.High,13.0,17.0,No,No,Modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,Landing Page Submission,Google,No,No,1,2.0,1428,1.0,...,No,,Mumbai,02.Medium,01.High,15.0,18.0,No,No,Modified
5,2058ef08-2858-443e-a01f-a9237db2f5ce,660680,API,Olark Chat,No,No,0,0.0,0,0.0,...,No,,,01.High,02.Medium,17.0,15.0,No,No,Modified
6,9fae7df4-169d-489b-afe4-0f3d752542ed,660673,Landing Page Submission,Google,No,No,1,2.0,1640,2.0,...,No,Potential Lead,Mumbai,02.Medium,01.High,14.0,20.0,No,No,Modified
7,20ef72a2-fb3b-45e0-924e-551c5fa59095,660664,API,Olark Chat,No,No,0,0.0,0,0.0,...,No,,,02.Medium,02.Medium,15.0,15.0,No,No,Modified
8,cfa0128c-a0da-4656-9d47-0aa4e67bf690,660624,Landing Page Submission,Direct Traffic,No,No,0,2.0,71,2.0,...,No,,Thane & Outskirts,02.Medium,02.Medium,14.0,14.0,No,Yes,Email Opened
9,af465dfc-7204-4130-9e05-33231863c4b5,660616,API,Google,No,No,0,4.0,58,4.0,...,No,,Mumbai,02.Medium,02.Medium,13.0,16.0,No,No,Email Opened


In [623]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 37 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Prospect_ID                         9240 non-null   object 
 1   Lead_Number                         9240 non-null   int64  
 2   Lead_Origin                         9240 non-null   object 
 3   Lead_Source                         9204 non-null   object 
 4   Do_Not_Email                        9240 non-null   object 
 5   Do_Not_Call                         9240 non-null   object 
 6   Converted                           9240 non-null   int64  
 7   Total_Visits                        9103 non-null   float64
 8   Total_Time_Spent_on_Website         9240 non-null   int64  
 9   Page_Views_Per_Visit                9103 non-null   float64
 10  Last_Activity                       9137 non-null   object 
 11  Country                             6779 no

### Finding categorical variables

In [624]:
categorical = [var for var in data.columns if data[var].dtype=='O'] #dtype means object

print('The total categorical variables are {}'.format(len(categorical)))
print(categorical)


The total categorical variables are 30
['Prospect_ID', 'Lead_Origin', 'Lead_Source', 'Do_Not_Email', 'Do_Not_Call', 'Last_Activity', 'Country', 'Specialization', 'How_did_you_hear_about_X_Education', 'Current_Occupation', 'Matters_Most_Choosing_Course', 'Search', 'Magazine', 'Newspaper_Article', 'X_Education_Forums', 'Newspaper', 'Digital_Advertisement', 'Through_Recommendations', 'Receive_More_Updates', 'Tags', 'Lead_Quality', 'Update_on_Supply_Chain_Content', 'Update_on_DM_Content', 'Lead_Profile', 'City', 'Asymmetrique_Activity_Index', 'Asymmetrique_Profile_Index', 'Agree_to_Pay_Cheque', 'Free_Copy_Mastering_The_Interview', 'Last_Notable_Activity']


In [625]:
data[categorical].head()

Unnamed: 0,Prospect_ID,Lead_Origin,Lead_Source,Do_Not_Email,Do_Not_Call,Last_Activity,Country,Specialization,How_did_you_hear_about_X_Education,Current_Occupation,...,Lead_Quality,Update_on_Supply_Chain_Content,Update_on_DM_Content,Lead_Profile,City,Asymmetrique_Activity_Index,Asymmetrique_Profile_Index,Agree_to_Pay_Cheque,Free_Copy_Mastering_The_Interview,Last_Notable_Activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,API,Olark Chat,No,No,Page Visited on Website,,,,Unemployed,...,Low in Relevance,No,No,,,02.Medium,02.Medium,No,No,Modified
1,2a272436-5132-4136-86fa-dcc88c88f482,API,Organic Search,No,No,Email Opened,India,,,Unemployed,...,,No,No,,,02.Medium,02.Medium,No,No,Email Opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,Landing Page Submission,Direct Traffic,No,No,Email Opened,India,Business Administration,,Student,...,Might be,No,No,Potential Lead,Mumbai,02.Medium,01.High,No,Yes,Email Opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,Landing Page Submission,Direct Traffic,No,No,Unreachable,India,Media and Advertising,Word Of Mouth,Unemployed,...,Not Sure,No,No,,Mumbai,02.Medium,01.High,No,No,Modified
4,3256f628-e534-4826-9d63-4a8b88782852,Landing Page Submission,Google,No,No,Converted to Lead,India,,Other,Unemployed,...,Might be,No,No,,Mumbai,02.Medium,01.High,No,No,Modified


### Missing values in the categorical variable after removing select

In [626]:
data[categorical].isnull().sum()

Prospect_ID                              0
Lead_Origin                              0
Lead_Source                             36
Do_Not_Email                             0
Do_Not_Call                              0
Last_Activity                          103
Country                               2461
Specialization                        3380
How_did_you_hear_about_X_Education    7250
Current_Occupation                    2690
Matters_Most_Choosing_Course          2709
Search                                   0
Magazine                                 0
Newspaper_Article                        0
X_Education_Forums                       0
Newspaper                                0
Digital_Advertisement                    0
Through_Recommendations                  0
Receive_More_Updates                     0
Tags                                  3353
Lead_Quality                          4767
Update_on_Supply_Chain_Content           0
Update_on_DM_Content                     0
Lead_Profil

In [627]:
#Before removing the select with Nan the null value were 1430 
# after removing the select with the Nan the null value were 3669
# These shows Nan is also calculated as the null value
data.City.isnull().sum()

3669

### Finding the categorical variable containing missing values

In [628]:
categorical_missing_values = [var for var in data.columns if data[var].isnull().sum() != 0]
print (data[categorical_missing_values].isnull().sum())

Lead_Source                             36
Total_Visits                           137
Page_Views_Per_Visit                   137
Last_Activity                          103
Country                               2461
Specialization                        3380
How_did_you_hear_about_X_Education    7250
Current_Occupation                    2690
Matters_Most_Choosing_Course          2709
Tags                                  3353
Lead_Quality                          4767
Lead_Profile                          6855
City                                  3669
Asymmetrique_Activity_Index           4218
Asymmetrique_Profile_Index            4218
Asymmetrique_Activity_Score           4218
Asymmetrique_Profile_Score            4218
dtype: int64


In [629]:
data.columns

Index(['Prospect_ID', 'Lead_Number', 'Lead_Origin', 'Lead_Source',
       'Do_Not_Email', 'Do_Not_Call', 'Converted', 'Total_Visits',
       'Total_Time_Spent_on_Website', 'Page_Views_Per_Visit', 'Last_Activity',
       'Country', 'Specialization', 'How_did_you_hear_about_X_Education',
       'Current_Occupation', 'Matters_Most_Choosing_Course', 'Search',
       'Magazine', 'Newspaper_Article', 'X_Education_Forums', 'Newspaper',
       'Digital_Advertisement', 'Through_Recommendations',
       'Receive_More_Updates', 'Tags', 'Lead_Quality',
       'Update_on_Supply_Chain_Content', 'Update_on_DM_Content',
       'Lead_Profile', 'City', 'Asymmetrique_Activity_Index',
       'Asymmetrique_Profile_Index', 'Asymmetrique_Activity_Score',
       'Asymmetrique_Profile_Score', 'Agree_to_Pay_Cheque',
       'Free_Copy_Mastering_The_Interview', 'Last_Notable_Activity'],
      dtype='object')

In [630]:
data.Country.unique()

array([nan, 'India', 'Russia', 'Kuwait', 'Oman', 'United Arab Emirates',
       'United States', 'Australia', 'United Kingdom', 'Bahrain', 'Ghana',
       'Singapore', 'Qatar', 'Saudi Arabia', 'Belgium', 'France',
       'Sri Lanka', 'China', 'Canada', 'Netherlands', 'Sweden', 'Nigeria',
       'Hong Kong', 'Germany', 'Asia/Pacific Region', 'Uganda', 'Kenya',
       'Italy', 'South Africa', 'Tanzania', 'unknown', 'Malaysia',
       'Liberia', 'Switzerland', 'Denmark', 'Philippines', 'Bangladesh',
       'Vietnam', 'Indonesia'], dtype=object)

In [631]:
data.Country.isnull().sum()

2461

In [632]:
data.Country.value_counts()

Country
India                   6492
United States             69
United Arab Emirates      53
Singapore                 24
Saudi Arabia              21
United Kingdom            15
Australia                 13
Qatar                     10
Hong Kong                  7
Bahrain                    7
Oman                       6
France                     6
unknown                    5
South Africa               4
Nigeria                    4
Germany                    4
Kuwait                     4
Canada                     4
Sweden                     3
China                      2
Asia/Pacific Region        2
Uganda                     2
Bangladesh                 2
Italy                      2
Belgium                    2
Netherlands                2
Ghana                      2
Philippines                2
Russia                     1
Switzerland                1
Vietnam                    1
Denmark                    1
Tanzania                   1
Liberia                    1
Malays

In [633]:
data.drop('Prospect_ID', axis = 1, inplace = True)

### Creating dummy variables for categorical variables

In [634]:

# Assuming 'data' is the original DataFrame containing your dataset
categorical_columns = ['Lead_Origin', 'Lead_Source', 'Do_Not_Email', 'Do_Not_Call', 'Last_Activity', 'Country', 'Specialization', 'How_did_you_hear_about_X_Education', 'Current_Occupation', 'Matters_Most_Choosing_Course', 'Search', 'Magazine', 'Newspaper_Article', 'X_Education_Forums', 'Newspaper', 'Digital_Advertisement', 'Through_Recommendations', 'Receive_More_Updates', 'Tags', 'Lead_Quality', 'Update_on_Supply_Chain_Content', 'Update_on_DM_Content', 'Lead_Profile', 'City', 'Asymmetrique_Activity_Index', 'Asymmetrique_Profile_Index', 'Agree_to_Pay_Cheque', 'Free_Copy_Mastering_The_Interview', 'Last_Notable_Activity']

# Create dummy variables for the categorical columns
df_dummies = pd.get_dummies(data[categorical_columns])
df_dummies = df_dummies.astype(int)

# Drop the original categorical columns from the 'data' DataFrame
data.drop(columns=categorical_columns, inplace=True)




In [635]:
df_dummies.head(5)

Unnamed: 0,Lead_Origin_API,Lead_Origin_Landing Page Submission,Lead_Origin_Lead Add Form,Lead_Origin_Lead Import,Lead_Origin_Quick Add Form,Lead_Source_Click2call,Lead_Source_Direct Traffic,Lead_Source_Facebook,Lead_Source_Google,Lead_Source_Live Chat,...,Last_Notable_Activity_Form Submitted on Website,Last_Notable_Activity_Had a Phone Conversation,Last_Notable_Activity_Modified,Last_Notable_Activity_Olark Chat Conversation,Last_Notable_Activity_Page Visited on Website,Last_Notable_Activity_Resubscribed to emails,Last_Notable_Activity_SMS Sent,Last_Notable_Activity_Unreachable,Last_Notable_Activity_Unsubscribed,Last_Notable_Activity_View in browser link Clicked
0,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0


### filling dummy variables with mode

In [636]:
# Calculate the mode for each column in 'df_dummies'
modes = df_dummies.mode().iloc[0]

# Fill the missing values in each column with its corresponding mode
df_dummies.fillna(modes, inplace=True)


### finding the missing value of dummy variable

In [637]:
df_dummies.isnull().sum()

Lead_Origin_API                                       0
Lead_Origin_Landing Page Submission                   0
Lead_Origin_Lead Add Form                             0
Lead_Origin_Lead Import                               0
Lead_Origin_Quick Add Form                            0
                                                     ..
Last_Notable_Activity_Resubscribed to emails          0
Last_Notable_Activity_SMS Sent                        0
Last_Notable_Activity_Unreachable                     0
Last_Notable_Activity_Unsubscribed                    0
Last_Notable_Activity_View in browser link Clicked    0
Length: 204, dtype: int64

### Finding columns with numeric values

In [638]:
# Assuming 'data' is the DataFrame containing your dataset
numeric_data = [var for var in data.columns if data[var].dtype != 'O'] # dtype != 'O' selects non-object columns

print('The total non-categorical variables are {}'.format(len(numeric_data)))
print(numeric_data)


The total non-categorical variables are 7
['Lead_Number', 'Converted', 'Total_Visits', 'Total_Time_Spent_on_Website', 'Page_Views_Per_Visit', 'Asymmetrique_Activity_Score', 'Asymmetrique_Profile_Score']


### Understanding numeric variables

In [639]:
data[numeric_data].isnull().sum()

Lead_Number                       0
Converted                         0
Total_Visits                    137
Total_Time_Spent_on_Website       0
Page_Views_Per_Visit            137
Asymmetrique_Activity_Score    4218
Asymmetrique_Profile_Score     4218
dtype: int64

### Value count

In [640]:
data.Asymmetrique_Profile_Score.value_counts()

Asymmetrique_Profile_Score
15.0    1759
18.0    1071
16.0     599
17.0     579
20.0     308
19.0     245
14.0     226
13.0     204
12.0      22
11.0       9
Name: count, dtype: int64

In [641]:
data.Asymmetrique_Activity_Score.value_counts()

Asymmetrique_Activity_Score
14.0    1771
15.0    1293
13.0     775
16.0     467
17.0     349
12.0     196
11.0      95
10.0      57
9.0        9
18.0       5
8.0        4
7.0        1
Name: count, dtype: int64

In [642]:
data.Page_Views_Per_Visit.value_counts()

Page_Views_Per_Visit
0.00    2189
2.00    1795
3.00    1196
4.00     896
1.00     651
        ... 
3.43       1
2.56       1
6.33       1
1.64       1
2.08       1
Name: count, Length: 114, dtype: int64

In [643]:
data.Total_Visits.value_counts()

Total_Visits
0.0      2189
2.0      1680
3.0      1306
4.0      1120
5.0       783
6.0       466
1.0       395
7.0       309
8.0       224
9.0       164
10.0      114
11.0       86
13.0       48
12.0       45
14.0       36
16.0       21
15.0       18
17.0       16
18.0       15
20.0       12
19.0        9
21.0        6
23.0        6
24.0        5
25.0        5
27.0        5
22.0        3
29.0        2
28.0        2
26.0        2
141.0       1
55.0        1
30.0        1
43.0        1
74.0        1
41.0        1
54.0        1
115.0       1
251.0       1
32.0        1
42.0        1
Name: count, dtype: int64

In [644]:
data.Converted.value_counts()

Converted
0    5679
1    3561
Name: count, dtype: int64

In [645]:
data.Lead_Number.value_counts()

Lead_Number
660737    1
603303    1
602561    1
602557    1
602540    1
         ..
630422    1
630405    1
630403    1
630390    1
579533    1
Name: count, Length: 9240, dtype: int64

In [646]:
numeric = data[numeric_data]
numeric.head()


Unnamed: 0,Lead_Number,Converted,Total_Visits,Total_Time_Spent_on_Website,Page_Views_Per_Visit,Asymmetrique_Activity_Score,Asymmetrique_Profile_Score
0,660737,0,0.0,0,0.0,15.0,15.0
1,660728,0,5.0,674,2.5,15.0,15.0
2,660727,1,2.0,1532,2.0,14.0,20.0
3,660719,0,1.0,305,1.0,13.0,17.0
4,660681,1,2.0,1428,1.0,15.0,18.0


### finding null values

In [647]:
numeric.isnull().sum()

Lead_Number                       0
Converted                         0
Total_Visits                    137
Total_Time_Spent_on_Website       0
Page_Views_Per_Visit            137
Asymmetrique_Activity_Score    4218
Asymmetrique_Profile_Score     4218
dtype: int64

### Removing null values with Mean 

In [648]:
# Calculate the mode for each column in 'df_dummies'
mean = numeric.mean().iloc[0]

# Fill the missing values in each column with its corresponding mode
numeric.fillna(mean, inplace = True)

In [649]:
numeric.isnull().sum()

Lead_Number                    0
Converted                      0
Total_Visits                   0
Total_Time_Spent_on_Website    0
Page_Views_Per_Visit           0
Asymmetrique_Activity_Score    0
Asymmetrique_Profile_Score     0
dtype: int64

In [650]:
numeric.columns

Index(['Lead_Number', 'Converted', 'Total_Visits',
       'Total_Time_Spent_on_Website', 'Page_Views_Per_Visit',
       'Asymmetrique_Activity_Score', 'Asymmetrique_Profile_Score'],
      dtype='object')

### Data normalization

In [651]:
scaler = MinMaxScaler()
normalized_data = ['Lead_Number', 'Total_Visits', 'Total_Time_Spent_on_Website',
                   'Page_Views_Per_Visit', 'Asymmetrique_Activity_Score', 'Asymmetrique_Profile_Score']
numeric[numeric_data] = scaler.fit_transform(numeric[numeric_data])
numeric

Unnamed: 0,Lead_Number,Converted,Total_Visits,Total_Time_Spent_on_Website,Page_Views_Per_Visit,Asymmetrique_Activity_Score,Asymmetrique_Profile_Score
0,1.000000,0.0,0.000000,0.000000,0.000000,0.000013,0.000006
1,0.999889,0.0,0.000008,0.296655,0.000004,0.000013,0.000006
2,0.999877,1.0,0.000003,0.674296,0.000003,0.000011,0.000015
3,0.999778,0.0,0.000002,0.134243,0.000002,0.000010,0.000010
4,0.999310,1.0,0.000003,0.628521,0.000002,0.000013,0.000011
...,...,...,...,...,...,...,...
9235,0.000382,1.0,0.000013,0.812060,0.000004,0.000013,0.000010
9236,0.000160,0.0,0.000003,0.104754,0.000003,0.000011,0.000013
9237,0.000148,0.0,0.000003,0.087588,0.000003,0.000010,0.000015
9238,0.000062,1.0,0.000005,0.219630,0.000005,0.000013,0.000008


### Understanding data shape 

In [652]:
print(numeric.shape)
print(df_dummies.shape)


(9240, 7)
(9240, 204)


### Merging entire dataset

In [653]:
df = pd.concat([df_dummies, numeric], axis = 1)

### Understanding shape of new dataset

In [654]:
df.shape

(9240, 211)