## Import packages

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from pylab import *
import seaborn as sns
from sklearn.utils import shuffle
%matplotlib inline
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

## Import Data

In [2]:
Travel = pd.read_csv("Travel.csv")

## Question Setup: Is there a difference in the number of visitors among the Occupation? 

## Data Wrangling 

In [3]:
Travel['ProdTaken'].value_counts()

0    3968
1     920
Name: ProdTaken, dtype: int64

Remove NA values

In [4]:
TravelDrop = Travel.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)

In [5]:
TravelDrop.describe()

Unnamed: 0,CustomerID,ProdTaken,Age,CityTier,DurationOfPitch,NumberOfPersonVisiting,NumberOfFollowups,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,MonthlyIncome
count,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0
mean,202527.763808,0.193072,37.231831,1.663275,15.584787,2.94937,3.741521,3.578488,3.2953,0.2953,3.060804,0.612161,1.223595,23178.464147
std,1409.439133,0.394757,9.174521,0.92064,8.398142,0.718818,1.006786,0.795031,1.8563,0.456233,1.363064,0.487317,0.852685,4506.614622
min,200000.0,0.0,18.0,1.0,5.0,1.0,1.0,3.0,1.0,0.0,1.0,0.0,0.0,1000.0
25%,201320.75,0.0,31.0,1.0,9.0,2.0,3.0,3.0,2.0,0.0,2.0,0.0,1.0,20751.0
50%,202603.5,0.0,36.0,1.0,14.0,3.0,4.0,3.0,3.0,0.0,3.0,1.0,1.0,22418.0
75%,203748.25,0.0,43.0,3.0,20.0,3.0,4.0,4.0,4.0,1.0,4.0,1.0,2.0,25301.0
max,204887.0,1.0,61.0,3.0,127.0,5.0,6.0,5.0,22.0,1.0,5.0,1.0,3.0,98678.0


Check numbers of customers Taken Product vs those who did not

In [6]:
TravelDrop['ProdTaken'].value_counts()

0    3331
1     797
Name: ProdTaken, dtype: int64

Since the majority of customer did not take product, we will only exam those who did in this analysis.

In [7]:
ProdTakendf = TravelDrop[TravelDrop["ProdTaken"] == 1]

In [8]:
ProdTakendf.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
14,200014,1,28.0,Self Enquiry,1,30.0,Salaried,Male,2,4.0,Basic,3.0,Single,6.0,1,2,0,0.0,Executive,17028.0
24,200024,1,34.0,Self Enquiry,1,12.0,Small Business,Male,2,3.0,Basic,5.0,Single,3.0,0,2,1,1.0,Executive,17960.0
33,200033,1,39.0,Self Enquiry,3,11.0,Large Business,Male,2,3.0,Deluxe,3.0,Divorced,4.0,0,2,0,1.0,Manager,17086.0


## Subsetting to Only the Variables Needed

In [9]:
Trips_df=ProdTakendf[["NumberOfTrips", "PitchSatisfactionScore", "PreferredPropertyStar", "NumberOfFollowups", "DurationOfPitch", "Age", "NumberOfChildrenVisiting"]]

In [10]:
Trips_df.shape 

(797, 7)

In [11]:
Trips_df.head()

Unnamed: 0,NumberOfTrips,PitchSatisfactionScore,PreferredPropertyStar,NumberOfFollowups,DurationOfPitch,Age,NumberOfChildrenVisiting
0,1.0,2,3.0,3.0,6.0,41.0,0.0
2,7.0,3,3.0,4.0,8.0,37.0,0.0
14,6.0,2,3.0,4.0,30.0,28.0,0.0
24,3.0,2,5.0,3.0,12.0,34.0,1.0
33,4.0,2,3.0,3.0,11.0,39.0,1.0


## Check data types of remaining column 

In [12]:
Trips_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 797 entries, 0 to 4887
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   NumberOfTrips             797 non-null    float64
 1   PitchSatisfactionScore    797 non-null    int64  
 2   PreferredPropertyStar     797 non-null    float64
 3   NumberOfFollowups         797 non-null    float64
 4   DurationOfPitch           797 non-null    float64
 5   Age                       797 non-null    float64
 6   NumberOfChildrenVisiting  797 non-null    float64
dtypes: float64(6), int64(1)
memory usage: 49.8 KB


## Converting datatype 

In [13]:
Trips_df.Age = np.round (Trips_df.Age)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [14]:
Trips_df.NumberOfTrips = Trips_df.NumberOfTrips.astype(int)
Trips_df.PreferredPropertyStar = Trips_df.PreferredPropertyStar.astype(int)
Trips_df.PitchSatisfactionScore = Trips_df.PitchSatisfactionScore.astype(int)
Trips_df.Age = Trips_df.Age.astype(int)
Trips_df.NumberOfFollowups = Trips_df.NumberOfFollowups.astype(int)
Trips_df.DurationOfPitch = Trips_df.DurationOfPitch.astype(int)
Trips_df.NumberOfChildrenVisiting = Trips_df.NumberOfChildrenVisiting.astype(int)

In [15]:
Trips_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 797 entries, 0 to 4887
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   NumberOfTrips             797 non-null    int32
 1   PitchSatisfactionScore    797 non-null    int32
 2   PreferredPropertyStar     797 non-null    int32
 3   NumberOfFollowups         797 non-null    int32
 4   DurationOfPitch           797 non-null    int32
 5   Age                       797 non-null    int32
 6   NumberOfChildrenVisiting  797 non-null    int32
dtypes: int32(7)
memory usage: 28.0 KB


### Defining X and Y

In [16]:
x_columns  = ["PitchSatisfactionScore", "PreferredPropertyStar", "NumberOfFollowups", "NumberOfChildrenVisiting","DurationOfPitch", "Age"]
y = Trips_df['NumberOfTrips']

In [17]:
y.head()

0     1
2     7
14    6
24    3
33    4
Name: NumberOfTrips, dtype: int32

## Creating function to get model statistics

In [18]:
def get_stats():
    x = Trips_df[x_columns]
    results = sm.OLS(y, x).fit()
    print(results.summary())

In [19]:
get_stats()

                                 OLS Regression Results                                
Dep. Variable:          NumberOfTrips   R-squared (uncentered):                   0.737
Model:                            OLS   Adj. R-squared (uncentered):              0.735
Method:                 Least Squares   F-statistic:                              369.8
Date:                Sat, 04 Dec 2021   Prob (F-statistic):                   1.26e-225
Time:                        14:14:22   Log-Likelihood:                         -1691.5
No. Observations:                 797   AIC:                                      3395.
Df Residuals:                     791   BIC:                                      3423.
Df Model:                           6                                                  
Covariance Type:            nonrobust                                                  
                               coef    std err          t      P>|t|      [0.025      0.975]
---------------------------

Variables with the highest p-value > 0.05 is NumberOfChildrenVisiting (0.296).

## Drop insignification variables

### Drop NumberOfChildrenVisiting

In [20]:
x_columns.remove("NumberOfChildrenVisiting")

In [21]:
y = Trips_df['NumberOfTrips']

### Creating function to get model statistics

In [22]:
def get_stats():
    x = Trips_df[x_columns]
    results = sm.OLS(y, x).fit()
    print(results.summary())

In [23]:
get_stats()

                                 OLS Regression Results                                
Dep. Variable:          NumberOfTrips   R-squared (uncentered):                   0.728
Model:                            OLS   Adj. R-squared (uncentered):              0.726
Method:                 Least Squares   F-statistic:                              423.7
Date:                Sat, 04 Dec 2021   Prob (F-statistic):                   5.50e-221
Time:                        14:14:23   Log-Likelihood:                         -1705.4
No. Observations:                 797   AIC:                                      3421.
Df Residuals:                     792   BIC:                                      3444.
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                             coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------

Variables with the highest p-value > 0.05 is NumberOfFollowups (0.232).

### Drop  NumberOfFollowups

In [24]:
x_columns.remove("NumberOfFollowups")

### Creating function to get model statistics

In [25]:
def get_stats():
    x = Trips_df[x_columns]
    results = sm.OLS(y, x).fit()
    print(results.summary())

In [26]:
get_stats()

                                 OLS Regression Results                                
Dep. Variable:          NumberOfTrips   R-squared (uncentered):                   0.716
Model:                            OLS   Adj. R-squared (uncentered):              0.715
Method:                 Least Squares   F-statistic:                              500.8
Date:                Sat, 04 Dec 2021   Prob (F-statistic):                   2.79e-215
Time:                        14:18:25   Log-Likelihood:                         -1721.8
No. Observations:                 797   AIC:                                      3452.
Df Residuals:                     793   BIC:                                      3470.
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                             coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------

Variables with the highest p-value > 0.05 is PreferredPropertyStar (0.276).

## Drop PreferredPropertyStar

In [28]:
x_columns.remove("PreferredPropertyStar")

### Creating function to get model statistics

In [29]:
def get_stats():
    x = Trips_df[x_columns]
    results = sm.OLS(y, x).fit()
    print(results.summary())

In [30]:
get_stats()

                                 OLS Regression Results                                
Dep. Variable:          NumberOfTrips   R-squared (uncentered):                   0.702
Model:                            OLS   Adj. R-squared (uncentered):              0.700
Method:                 Least Squares   F-statistic:                              622.0
Date:                Sat, 04 Dec 2021   Prob (F-statistic):                   6.69e-208
Time:                        14:20:37   Log-Likelihood:                         -1742.2
No. Observations:                 797   AIC:                                      3490.
Df Residuals:                     794   BIC:                                      3504.
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                             coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------

The p-value of the remaining variable are less than 0.05 except for Age =0.05.  We Will exam the mean for these variable.