# Import packages

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from pylab import *
import seaborn as sns
from sklearn.utils import shuffle
%matplotlib inline
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Import Data

In [2]:
Travel = pd.read_csv("Travel.csv")

## Question Setup: Is there a difference in age, monthly income, number of follow up, pitch duration among those who taken a product or not. 

In [4]:
Travel.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [5]:
Travel.dropna(inplace=True)

In [21]:
TravelDrop=Travel[["ProdTaken", "NumberOfTrips", "NumberOfPersonVisiting", "NumberOfFollowups", "Age", "DurationOfPitch", "MonthlyIncome"]]

In [22]:
TravelDrop.shape 

(4128, 7)

In [23]:
TravelDrop.head()

Unnamed: 0,ProdTaken,NumberOfTrips,NumberOfPersonVisiting,NumberOfFollowups,Age,DurationOfPitch,MonthlyIncome
0,1,1.0,3,3.0,41.0,6.0,20993.0
1,0,2.0,3,4.0,49.0,14.0,20130.0
2,1,7.0,3,4.0,37.0,8.0,17090.0
3,0,2.0,2,3.0,33.0,9.0,17909.0
5,0,1.0,3,3.0,32.0,8.0,18068.0


## Check data types of remaining column 

In [24]:
TravelDrop.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4128 entries, 0 to 4887
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ProdTaken               4128 non-null   int64  
 1   NumberOfTrips           4128 non-null   float64
 2   NumberOfPersonVisiting  4128 non-null   int64  
 3   NumberOfFollowups       4128 non-null   float64
 4   Age                     4128 non-null   float64
 5   DurationOfPitch         4128 non-null   float64
 6   MonthlyIncome           4128 non-null   float64
dtypes: float64(5), int64(2)
memory usage: 258.0 KB


## Converting datatype 

In [25]:
TravelDrop.Age = np.round (TravelDrop.Age)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [26]:
TravelDrop.NumberOfTrips = TravelDrop.NumberOfTrips.astype(int)
TravelDrop.Age = TravelDrop.Age.astype(int)
TravelDrop.DurationOfPitch = TravelDrop.DurationOfPitch.astype(int)
TravelDrop.MonthlyIncome = TravelDrop.MonthlyIncome.astype(int)

In [27]:
TravelDrop.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4128 entries, 0 to 4887
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ProdTaken               4128 non-null   int64  
 1   NumberOfTrips           4128 non-null   int32  
 2   NumberOfPersonVisiting  4128 non-null   int64  
 3   NumberOfFollowups       4128 non-null   float64
 4   Age                     4128 non-null   int32  
 5   DurationOfPitch         4128 non-null   int32  
 6   MonthlyIncome           4128 non-null   int32  
dtypes: float64(1), int32(4), int64(2)
memory usage: 193.5 KB


### Defining X and Y

In [32]:
x_columns = ["NumberOfTrips", "NumberOfPersonVisiting", "NumberOfFollowups", "Age", "DurationOfPitch", "MonthlyIncome"]

In [33]:
y = TravelDrop['ProdTaken']

## Creating function to get model statistics

In [34]:
def get_stats():
    x = TravelDrop[x_columns]
    results = sm.OLS(y, x).fit()
    print(results.summary())

In [35]:
get_stats()

                                 OLS Regression Results                                
Dep. Variable:              ProdTaken   R-squared (uncentered):                   0.222
Model:                            OLS   Adj. R-squared (uncentered):              0.221
Method:                 Least Squares   F-statistic:                              196.4
Date:                Mon, 06 Dec 2021   Prob (F-statistic):                   9.74e-221
Time:                        19:02:46   Log-Likelihood:                         -1943.8
No. Observations:                4128   AIC:                                      3900.
Df Residuals:                    4122   BIC:                                      3938.
Df Model:                           6                                                  
Covariance Type:            nonrobust                                                  
                             coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------

NumberOfPersonVisiting has really high p-value. We are dropping this variable.

## Remove Monthly Income

In [36]:
x_columns.remove("NumberOfPersonVisiting")

## Rerun test without MonthlyIncome

In [37]:
def get_stats():
    x = TravelDrop[x_columns]
    results = sm.OLS(y, x).fit()
    print(results.summary())
get_stats()

                                 OLS Regression Results                                
Dep. Variable:              ProdTaken   R-squared (uncentered):                   0.222
Model:                            OLS   Adj. R-squared (uncentered):              0.221
Method:                 Least Squares   F-statistic:                              235.6
Date:                Mon, 06 Dec 2021   Prob (F-statistic):                   7.89e-222
Time:                        19:03:15   Log-Likelihood:                         -1944.1
No. Observations:                4128   AIC:                                      3898.
Df Residuals:                    4123   BIC:                                      3930.
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                        coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------

All of the remaining p-values are really low.  we are going to conclude that these variables are highly correlated with ProdTaken

### Exam means of variable 

In [38]:
TravelDrop.groupby('ProdTaken')['NumberOfTrips', 'NumberOfFollowups', 'Age', 'DurationOfPitch', 'MonthlyIncome'].mean()

  TravelDrop.groupby('ProdTaken')['NumberOfTrips', 'NumberOfFollowups', 'Age', 'DurationOfPitch', 'MonthlyIncome'].mean()


Unnamed: 0_level_0,NumberOfTrips,NumberOfFollowups,Age,DurationOfPitch,MonthlyIncome
ProdTaken,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,3.281297,3.686881,37.927049,15.247073,23472.894626
1,3.353827,3.969887,34.326223,16.996236,21947.91468
