In [None]:
#libraries to import 
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
import scipy.stats as stats
import os
import sklearn

-- X1 Delivery speed---amount of time it takes to deliver the product once an
order has been confirmed

-- X2 Price level---perceived level of price charged by product suppliers

-- X3 Price flexibility---perceived willingness of HATCO representatives to
negotiate price on all types of purchases

-- X4 Manufacturer's image---overall image of the manufacturer/supplier

-- X5 Service---overall level of service necessary for maintaining a satisfactory
relationship between supplier and purchaser

-- X6 Salesforce's image---overall image of the manufacturer's sales force

-- X7 Product quality---perceived level of quality of a particular product (e.g.,
performance or yield) 

-- X8 Size of firm---size of the firm relative to others in this market. This
variable has two categories: 1=large, and 0=small 

-- X9 Usage level---how much of the firm's total product is purchased from
HATCO, measured on a 100-point percentage scale, ranging from 0 to 100
percent

-- X10 Satisfaction level---how satisfied the purchaser is with past purchases
from HATCO, measured on the same graphic rating scale as the perceptions
X1 to X7

-- X11 Specification buying---extent to which a particular purchaser evaluates
each purchase separately (total value analysis) versus the use of specification
buying, which details precisely the product characteristics desired. This
variable has two categories: 1=employs total value analysis approach,
evaluating each purchase separately, and 0=use of specification buying

-- X12 Structure of procurement---method of procuring/purchasing products
within a particular company. This variable has two categories: 1=centralized
procurement, and 0=decentralized procurement

-- X13 Type of industry---industry classification in which a product purchaser
belongs. This variable has two categories: 1=industry A classification, and
0=other industries

-- X14 Type of buying situation---type of situation facing the purchaser. This
variable has three categories: 1=new task, 2=modified rebuy, and 3=straight
rebuy

In [None]:
# to turn off scientific notation
pd.options.display.float_format = '{:.2f}'.format # to have formating of output to two decimal value 
# pd.set_option("display.precision", 2)

In [None]:
# NOTE(review): absolute, machine-specific working directory -- the notebook only
# runs where this folder exists; consider a configurable data directory instead.
os.chdir(r'D:\BML Munjal University\Module 6\Predictive Analytics')
# reading data (path is relative to the working directory set above)
data = pd.read_excel("hatco.xlsx")

In [None]:
data.head()

In [None]:
data.info()

In [None]:
data.describe()

In [None]:
data.columns

In [None]:
# Mean rating of each perception variable X1..X7, as [feature, mean] pairs.
perception_cols = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7']
average = [[col, data[col].mean()] for col in perception_cols]

# Two-column summary table (feature name, average rating).
average_data = pd.DataFrame(average, columns=['Feature', 'Average'])
average_data.head()

In [None]:
plt.figure(figsize=(20,7))
sns.barplot(data=average_data, x="Feature", y="Average")

We can see that X3 has the highest average, which means the perceived willingness of HATCO representatives to negotiate price on all types of purchases is rated highest. The perceived level of quality of a particular product is nearly excellent. The perceived level of price charged by the product suppliers is very poor, which means that the respondents think the prices charged by HATCO for its products are high.

In [None]:
# Let 40% be the threshold for usage level

high_usage = data.loc[data['X9']>=40]
low_usage = data.loc[data['X9']<40]

In [None]:
len(low_usage)
#len(high_usage)

In [None]:
# Respondent counts per usage group, computed from the high_usage / low_usage
# frames rather than typed in by hand, so the table stays correct if the data
# or the 40% threshold changes.
usage = pd.DataFrame({
    'Usage': ['High Usage', 'Low Usage'],
    'Count': [len(high_usage), len(low_usage)],
})
usage

In [None]:
plt.figure(figsize=(20,7))
sns.barplot(data=usage, x="Usage", y="Count")

Only a few respondents have a low level of purchases from HATCO.

In [None]:
data_num = data.select_dtypes(exclude = 'object')
data_cat = data.select_dtypes(include = 'object')

In [None]:
data_num

In [None]:
# Max-abs scaling: divide every numeric column by its own largest absolute
# value. The division is aligned on column labels, so each column is scaled
# independently and data_num itself is left untouched.
data_norm = data_num / data_num.abs().max()

# view normalized data
display(data_norm)

In [None]:
plt.figure(figsize=(15,10))
sns.boxplot(data=data_norm.drop(data_norm.columns[[0]], axis=1), orient="h")

We can see that there are only a few outliers in the dataset.

In [None]:
# Tukey fences for X5 (service): values further than 1.5 * IQR outside the
# quartiles are treated as outliers.
# Series.quantile is used instead of np.percentile(..., interpolation=...),
# because the `interpolation` keyword of np.percentile is deprecated in
# recent NumPy releases; 'midpoint' gives the same quartile definition.
Q1, Q2, Q3 = data_norm['X5'].quantile([0.25, 0.50, 0.75], interpolation='midpoint')
IQR = Q3 - Q1
low_lim1 = Q1 - 1.5 * IQR
up_lim1 = Q3 + 1.5 * IQR
print('low_limit is', low_lim1)
print('up_limit is', up_lim1)

# Collect the out-of-range values with a boolean mask instead of a Python loop.
x5_values = data_norm['X5']
outlier1 = x5_values[(x5_values > up_lim1) | (x5_values < low_lim1)].tolist()
print(' Number of outlier in the dataset is', len(outlier1))

In [None]:
# Tukey fences for X4 (manufacturer's image): values further than 1.5 * IQR
# outside the quartiles are treated as outliers.
# Series.quantile is used instead of np.percentile(..., interpolation=...),
# because the `interpolation` keyword of np.percentile is deprecated in
# recent NumPy releases; 'midpoint' gives the same quartile definition.
Q1, Q2, Q3 = data_norm['X4'].quantile([0.25, 0.50, 0.75], interpolation='midpoint')
IQR = Q3 - Q1
low_lim2 = Q1 - 1.5 * IQR
up_lim2 = Q3 + 1.5 * IQR
print('low_limit is', low_lim2)
print('up_limit is', up_lim2)

# Collect the out-of-range values with a boolean mask instead of a Python loop.
x4_values = data_norm['X4']
outlier2 = x4_values[(x4_values > up_lim2) | (x4_values < low_lim2)].tolist()
print(' Number of outlier in the dataset is', len(outlier2))

In [None]:
# Tukey fences for X6 (salesforce's image): values further than 1.5 * IQR
# outside the quartiles are treated as outliers.
# Series.quantile is used instead of np.percentile(..., interpolation=...),
# because the `interpolation` keyword of np.percentile is deprecated in
# recent NumPy releases; 'midpoint' gives the same quartile definition.
Q1, Q2, Q3 = data_norm['X6'].quantile([0.25, 0.50, 0.75], interpolation='midpoint')
IQR = Q3 - Q1
low_lim3 = Q1 - 1.5 * IQR
up_lim3 = Q3 + 1.5 * IQR
print('low_limit is', low_lim3)
print('up_limit is', up_lim3)

# Collect the out-of-range values with a boolean mask instead of a Python loop.
x6_values = data_norm['X6']
outlier3 = x6_values[(x6_values > up_lim3) | (x6_values < low_lim3)].tolist()
print(' Number of outlier in the dataset is', len(outlier3))

In [None]:
# Capping the outliers

# For X6, X5 and X4, pull every value that lies outside its IQR fences back to
# the nearest fence; values already inside the fences are left unchanged.
for col, lower, upper in (
    ("X6", low_lim3, up_lim3),
    ("X5", low_lim1, up_lim1),
    ("X4", low_lim2, up_lim2),
):
    data_norm[col] = data_norm[col].clip(lower=lower, upper=upper)

In [None]:
plt.figure(figsize=(15,10))
sns.boxplot(data=data_norm.drop(data_norm.columns[[0]], axis=1), orient="h")

In [None]:
def hist(variable):
    """Draw a histogram with a KDE overlay for one column of ``data_norm``.

    Parameters
    ----------
    variable : str
        Name of the ``data_norm`` column to plot on the x axis.
    """
    plot_kwargs = {"data": data_norm, "x": variable, "kde": True}
    sns.histplot(**plot_kwargs)


def box(var):
    """Draw a box plot for one column of ``data_norm``.

    Parameters
    ----------
    var : str
        Name of the ``data_norm`` column to plot.
    """
    column_values = data_norm[var]
    sns.boxplot(x=column_values)

In [None]:
box("X1")

In [None]:
hist("X1")

In [None]:
hist("X2")

In [None]:
box("X2")

In [None]:
hist("X3")

In [None]:
box("X3")

In [None]:
hist("X4")

In [None]:
box("X4")

In [None]:
hist("X5")

In [None]:
box("X5")

In [None]:
hist("X6")

In [None]:
box("X6")

In [None]:
hist("X7")

In [None]:
box("X7")

In [None]:
hist("X9")

In [None]:
box("X9")

In [None]:
hist("X10")

In [None]:
box("X10")

In [None]:
plt.figure(figsize=(20,20))
sns.heatmap(data_norm.corr(), linewidths=0.5, annot=True, fmt=".2f", cmap = 'viridis')

In [None]:
plt.rcParams['figure.figsize']=(15,10)
data_num.hist(bins = 30)
plt.show()

In [None]:
data_norm.isnull().sum()

In [None]:
low_sat = data.loc[data['X10']<=5]
high_sat = data.loc[data['X10']>5]
print("low ",len(low_sat))
print("high ",len(high_sat))

In [None]:
# Let us look at the purchasers who have low satisfaction with the past purchases.

low_sat.describe()

In [None]:
plt.figure(figsize=(15,10))
sns.boxplot(data=low_sat.drop(low_sat.columns[[0]], axis=1), orient="h")

One of the respondents received a very low level of customer service (X5), due to which his satisfaction is low.

Even though one respondent's delivery time (X1) was very low, he is still not satisfied. The company should probe this respondent further.

In [None]:
pd.crosstab(data['X8'], [data['X11']])

There are no small firms that use specification buying. There are no large firm which employ total value analysis approach. There are 60 small firms which employ total value analysis approach, there are 40 large firms which employ specification buying.

In [None]:
pd.crosstab(data['X8'], [data['X12']])

No large firms have decentralized procurement structure. 50 small firms have decentralized procurement structure. Only 10 small firms have centralized procurement structure. 

In [None]:
pd.crosstab(data['X8'], [data['X13']])

Equal number of small and large firms belong to industry A classification and other classification of industries.

Problem Statement 1 - To predict the usage level of the firms
Problem Statement 2 - To predict the type of buying situation

# To predict the usage level of the firms

In [None]:
data_norm

In [None]:
sns.pairplot(data_norm,hue ='X13')
plt.show()

In [None]:
data_norm.info()

In [None]:
x1 = data_norm.drop(data_norm.columns[[9]], axis=1)
y1 = data_norm['X9']
x1.info()

In [None]:
#Step 4: Split the data into x_train and x_test, y_train and y_test
from sklearn.model_selection import train_test_split

x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, test_size=0.2, random_state=50)


In [None]:
print(x1_train.shape)
print(x1_test.shape)
print(y1_train.shape)
print(y1_test.shape)

In [None]:
#Step 5: Train the model using x_train and y_train
import statsmodels.api as sm
# NOTE(review): no constant column is added to x1_train, and sm.OLS does not
# add an intercept on its own, so this fits a regression through the origin --
# confirm that is intended (see sm.add_constant).
lin_reg_model_1 = sm.OLS(y1_train, x1_train).fit()
lin_reg_model_1.summary()

In [None]:
#Step 6: Pass x_test to the model to predict y.
predictions_model_1 = lin_reg_model_1.predict(x1_test)
predictions_model_1

In [None]:
y1_test
#This is not close to the historical y. So now you do RMSE

In [None]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y1_test, predictions_model_1)
rmse = np.sqrt(mse)
rmse

# Removing variables with p value greater than 0.05

In [None]:
data_norm.info()

In [None]:
# Removed only ID and Manufacturer's image. Others seem to be significant.
x2 = data_norm.drop(data_norm.columns[[9,0,4]], axis=1)
y2 = data_norm['X9']
x2.info()

In [None]:
#Step 4: Split the data into x_train and x_test, y_train and y_test
from sklearn.model_selection import train_test_split

x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y2, test_size=0.2, random_state=50)


In [None]:
print(x2_train.shape)
print(x2_test.shape)
print(y2_train.shape)
print(y2_test.shape)

In [None]:
import statsmodels.api as sm
lin_reg_model_2 = sm.OLS(y2_train, x2_train).fit()
lin_reg_model_2.summary()

In [None]:
predictions_model_2 = lin_reg_model_2.predict(x2_test)
predictions_model_2

In [None]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y2_test, predictions_model_2)
rmse = np.sqrt(mse)
rmse

# VIF treatment

In [None]:
X = sm.add_constant(x1)
X.head()

In [None]:
X.columns

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# the independent variables set
X1 = X[['ID', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X10',
       'X11', 'X12', 'X13', 'X14']]

# variance_inflation_factor regresses each column on the others exactly as
# given and does not add an intercept itself. Without a constant column the
# auxiliary R^2 is uncentred and the VIFs are inflated, so a constant is
# prepended for the computation only; X1 itself is left unchanged.
vif_exog = sm.add_constant(X1, has_constant="add")

# VIF dataframe: one row per predictor (the constant is not reported)
vif_data = pd.DataFrame({
    "feature": X1.columns,
    # column 0 of vif_exog is the constant, so predictors start at position 1
    "VIF": [variance_inflation_factor(vif_exog.values, i)
            for i in range(1, vif_exog.shape[1])],
})

print(vif_data)

In [None]:
# If we remove service (X5)

# the independent variables set
X2 = X1[['X1', 'X2', 'X3', 'X4', 'X11', 'X6', 'X7', 'X8',
         'X10','X12', 'X13', 'X14']]

# variance_inflation_factor does not add an intercept itself; without a
# constant column the VIFs are uncentred and inflated, so a constant is
# prepended for the computation only. X2 itself is left unchanged.
vif_exog = sm.add_constant(X2, has_constant="add")

# VIF dataframe: one row per predictor (the constant is not reported)
vif_data = pd.DataFrame({
    "feature": X2.columns,
    # column 0 of vif_exog is the constant, so predictors start at position 1
    "VIF": [variance_inflation_factor(vif_exog.values, i)
            for i in range(1, vif_exog.shape[1])],
})

print(vif_data)

In [None]:
# If we remove structure of procurement (X12)

# the independent variables set
X3 = X2[['X1', 'X2', 'X3', 'X4', 'X6','X11', 'X7', 'X8',
         'X10','X13','X14']]

# variance_inflation_factor does not add an intercept itself; without a
# constant column the VIFs are uncentred and inflated, so a constant is
# prepended for the computation only. X3 itself is left unchanged.
vif_exog = sm.add_constant(X3, has_constant="add")

# VIF dataframe: one row per predictor (the constant is not reported)
vif_data = pd.DataFrame({
    "feature": X3.columns,
    # column 0 of vif_exog is the constant, so predictors start at position 1
    "VIF": [variance_inflation_factor(vif_exog.values, i)
            for i in range(1, vif_exog.shape[1])],
})

print(vif_data)

In [None]:
X3.info()

In [None]:
X3= pd.concat([X3,data_norm['X9']], axis = 1)
X3.info()

In [None]:
from statsmodels.formula.api import ols
lm = ols("X9 ~ X1+X2+X3+X4+X6+X11+X7+X8+X10+X13+X14", data=X3).fit()
print(lm.summary())

Performance decreased.

# Using cook's distance

In [None]:
from yellowbrick.regressor import CooksDistance

# Cook's distance for the usage-level regression on the HATCO data.
# x1 / y1 are the predictors and target (X9) built from data_norm earlier in
# the notebook. They must not be replaced by yellowbrick's bundled concrete
# demo dataset here, otherwise this plot and every later cell that reuses
# x1 / y1 would describe unrelated data.

# Instantiate and fit the visualizer
visualizer = CooksDistance()
visualizer.fit(x1, y1)
visualizer.show()

The presence of so many highly influential points suggests that linear regression may not be suitable for this dataset. One or more of the four assumptions behind linear regression might be being violated; namely one of: independence of observations, linearity of response, normality of residuals, or homogeneity of variance (“homoscedasticity”).

In [None]:
from sklearn.linear_model import LinearRegression
from yellowbrick.regressor import ResidualsPlot

# Instantiate and fit the visualizer
model3 = LinearRegression()
visualizer_residuals = ResidualsPlot(model3)
visualizer_residuals.fit(x1, y1)
visualizer_residuals.show()

The residuals appear to be normally distributed around 0, satisfying the linearity and normality conditions. However, they do skew slightly positive for larger predicted values, and also appear to increase in magnitude as the predicted value increases, suggesting a violation of the homoscedasticity condition.

In [None]:
data_norm.columns

In [None]:
lm = ols("X9 ~ X1+X2+X3+X4+X5+X6+X7+X8+X11+X10+X12+X13+X14 ", data=data_norm).fit()

In [None]:
# print regression results
print(lm.summary())

In [None]:
fig = sm.graphics.influence_plot(lm, criterion="cooks")
fig.tight_layout(pad=1.0)

In [None]:
# obtain Cook's distance 
lm_cooksd = lm.get_influence().cooks_distance[0]

In [None]:
n = len(data_norm["X1"])
print(n)

In [None]:
# calculate critical d
critical_d = 4/n
print('Critical Cooks distance:', critical_d)

In [None]:
out_d = lm_cooksd > critical_d

In [None]:
print(data_norm.index[out_d], "\n", 
    lm_cooksd[out_d])

In [None]:
# Show the rows whose Cook's distance exceeds the critical value, selected with
# the computed mask `out_d` rather than a hand-copied list of row indices.
data_norm.loc[out_d]

In [None]:
# Drop the influential observations flagged by Cook's distance. The index is
# taken from the mask `out_d`, so the set of dropped rows follows the fitted
# model instead of a hard-coded list.
new_data = data_norm.drop(index=data_norm.index[out_d])

In [None]:
new_data.info()

In [None]:
x4 = new_data.drop('X9',axis=1)
y4 = new_data['X9']
x4.head()

In [None]:
#Step 4: Split the data into x_train and x_test, y_train and y_test
from sklearn.model_selection import train_test_split

x4_train, x4_test, y4_train, y4_test = train_test_split(x4, y4, test_size=0.2, random_state=50)


In [None]:
print(x4_train.shape)
print(x4_test.shape)
print(y4_train.shape)
print(y4_test.shape)

In [None]:
import statsmodels.api as sm
lin_reg_model_4 = sm.OLS(y4_train, x4_train).fit()
lin_reg_model_4.summary()

In [None]:
predictions_model_4 = lin_reg_model_4.predict(x4_test)
predictions_model_4

In [None]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y4_test, predictions_model_4)
rmse = np.sqrt(mse)
rmse

# PCA

In [None]:
# importing or loading the dataset
# (same file name as the first load, so this also works on case-sensitive
# file systems)
dataset = pd.read_excel('hatco.xlsx')

# distributing the dataset into two components X and Y
# Problem Statement 2 is to predict the type of buying situation (X14).
# Position 0 is the respondent ID, positions 1-13 are X1..X13 and position 14
# is X14. The target is kept out of X so it cannot leak into the principal
# components that the classifier is trained on.
X = dataset.iloc[:, 1:14].values
y = dataset.iloc[:, 14].values

In [None]:
# Splitting the X and Y into the
# Training set and Testing set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)


In [None]:
# performing preprocessing part
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


In [None]:
# Applying PCA function on training
# and testing set of X component
from sklearn.decomposition import PCA

pca = PCA(n_components = 2)

X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

explained_variance = pca.explained_variance_ratio_


In [None]:
explained_variance

In [None]:
# Fitting Logistic Regression To the training set
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)


In [None]:
# Predicting the test set result using
# predict function under LogisticRegression
y_pred = classifier.predict(X_test)


In [None]:
# making confusion matrix between
# test set of Y and predicted value.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)


In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score
s1 = sns.heatmap(cm ,annot=True ,fmt='d')
s1.set(xlabel='Predicted', ylabel='Actual')
print("Model accuracy for model 1:",accuracy_score(y_test, y_pred))

In [None]:
# Predicting the training set
# result through scatter plot
from matplotlib.colors import ListedColormap

X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1,
					stop = X_set[:, 0].max() + 1, step = 0.01),
					np.arange(start = X_set[:, 1].min() - 1,
					stop = X_set[:, 1].max() + 1, step = 0.01))

plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(),
			X2.ravel()]).T).reshape(X1.shape), alpha = 0.75,
			cmap = ListedColormap(('yellow', 'white', 'aquamarine')))

plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())

for i, j in enumerate(np.unique(y_set)):
	plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
				c = ListedColormap(('red', 'green', 'blue'))(i), label = j)

plt.title('Logistic Regression (Training set)')
plt.xlabel('PC1') # for Xlabel
plt.ylabel('PC2') # for Ylabel
plt.legend() # to show legend

# show scatter plot
plt.show()


In [None]:
# Visualising the Test set results through scatter plot
from matplotlib.colors import ListedColormap

X_set, y_set = X_test, y_test

X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1,
					stop = X_set[:, 0].max() + 1, step = 0.01),
					np.arange(start = X_set[:, 1].min() - 1,
					stop = X_set[:, 1].max() + 1, step = 0.01))

plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(),
			X2.ravel()]).T).reshape(X1.shape), alpha = 0.75,
			cmap = ListedColormap(('yellow', 'white', 'aquamarine')))

plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())

for i, j in enumerate(np.unique(y_set)):
	plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
				c = ListedColormap(('red', 'green', 'blue'))(i), label = j)

# title for scatter plot
plt.title('Logistic Regression (Test set)')
plt.xlabel('PC1') # for Xlabel
plt.ylabel('PC2') # for Ylabel
plt.legend()

# show scatter plot
plt.show()


# Cluster Analysis

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.metrics import silhouette_score

In [None]:
kmeans = KMeans(n_clusters=2, init="k-means++",random_state=0)
kmeans.fit(data_norm)

In [None]:
kmeans.inertia_

In [None]:
from sklearn.cluster import KMeans

wcss = []
for k in range(2,11):
    kmeans = KMeans(n_clusters=k, init="k-means++",random_state=42)
    kmeans.fit(data_norm)
    wcss.append(kmeans.inertia_)
plt.figure(figsize=(12,6))    
plt.grid()
plt.plot(range(2,11),wcss, linewidth=2, color="red", marker ="8")
plt.xlabel("K Value")
plt.xticks(np.arange(1,11,1))
plt.ylabel("WCSS")
plt.show()

In [None]:
km = KMeans(n_clusters=5,random_state = 42)
clusters = km.fit_predict(data_norm)
clusters

In [None]:
data_norm["label"] = clusters
data_norm

In [None]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(data_norm.X1[data_norm.label == 0], data_norm["X2"][data_norm.label == 0], data_norm["X3"][data_norm.label == 0], c='blue', s=60)
ax.scatter(data_norm.X1[data_norm.label == 1], data_norm["X2"][data_norm.label == 1], data_norm["X3"][data_norm.label == 1], c='red', s=60)
ax.scatter(data_norm.X1[data_norm.label == 2], data_norm["X2"][data_norm.label == 2], data_norm["X3"][data_norm.label == 2], c='green', s=60)
ax.scatter(data_norm.X1[data_norm.label == 3], data_norm["X2"][data_norm.label == 3], data_norm["X3"][data_norm.label == 3], c='orange', s=60)
ax.scatter(data_norm.X1[data_norm.label == 4], data_norm["X2"][data_norm.label == 4], data_norm["X3"][data_norm.label == 4], c='purple', s=60)
ax.view_init(30, 185)
plt.xlabel("Delivery Speed")
plt.ylabel("Price Level")
ax.set_zlabel('Price Flexibility')
plt.show()

Calculate centroids, and interpret the centroids of each cluster.

In [None]:
data_norm.groupby("label")['X1'].mean().plot.bar() # Delivery speed

In [None]:
data_norm.groupby("label")['X2'].mean().plot.bar() #Price level

In [None]:
data_norm.groupby("label")['X3'].mean().plot.bar() # Price flexibility

In [None]:
# Using 6 clusters
# data_norm already carries the "label" column written by the 5-cluster run;
# exclude it so the earlier cluster assignment is not used as a feature.
# errors="ignore" keeps the cell runnable when the column is absent.
km = KMeans(n_clusters=6,random_state = 42)
clusters = km.fit_predict(data_norm.drop(columns=["label"], errors="ignore"))
clusters

In [None]:
data_norm["label"] = clusters
data_norm

In [None]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(data_norm.X1[data_norm.label == 0], data_norm["X2"][data_norm.label == 0], data_norm["X3"][data_norm.label == 0], c='blue', s=60)
ax.scatter(data_norm.X1[data_norm.label == 1], data_norm["X2"][data_norm.label == 1], data_norm["X3"][data_norm.label == 1], c='red', s=60)
ax.scatter(data_norm.X1[data_norm.label == 2], data_norm["X2"][data_norm.label == 2], data_norm["X3"][data_norm.label == 2], c='green', s=60)
ax.scatter(data_norm.X1[data_norm.label == 3], data_norm["X2"][data_norm.label == 3], data_norm["X3"][data_norm.label == 3], c='orange', s=60)
ax.scatter(data_norm.X1[data_norm.label == 4], data_norm["X2"][data_norm.label == 4], data_norm["X3"][data_norm.label == 4], c='purple', s=60)
ax.scatter(data_norm.X1[data_norm.label == 5], data_norm["X2"][data_norm.label == 5], data_norm["X3"][data_norm.label == 5], c='black', s=60)
ax.view_init(30, 185)
plt.xlabel("Delivery Speed")
plt.ylabel("Price Level")
ax.set_zlabel('Price Flexibility')
plt.show()

In [None]:
data_norm.groupby("label")['X1'].mean().plot.bar()

In [None]:
data_norm.groupby("label")['X2'].mean().plot.bar()

In [None]:
data_norm.groupby("label")['X3'].mean().plot.bar()

In [None]:
data_norm.info()

In [None]:
from sklearn import cluster
dd = data_norm.loc[:,["X1","X2","X3","X4","X5","X6","X7","X8"]]
dd1 = dd.drop("X8",axis = 1)
k_means = cluster.KMeans(n_clusters=2, max_iter=50, random_state=1)
k_means.fit(dd1) 
labels = k_means.labels_
pd.DataFrame(labels, index=dd.X8, columns=['Cluster ID'])

In [None]:
centroids = k_means.cluster_centers_
pd.DataFrame(centroids,columns=dd1.columns)

Those respondents who have a perception of/higher ratings for:
High:
Delivery speed
Price flexibility
Manufacturer's image
Product quality
Service
Low:
Price level
Salesforce image
are small firms.

Those respondents who have a perception of:
High:
Price level
Manufacturer's image
Price flexibility
Service
Salesforce image
Product quality
Low:
Delivery speed
are large firms

In [None]:
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

# Elbow curve: within-cluster sum of squares (inertia) for k = 1..6.
# NOTE(review): this fits on the raw `data` frame (unscaled, ID column
# included) while the other clustering cells use data_norm -- confirm intended.
numClusters = [1,2,3,4,5,6]
SSE = []
for k in numClusters:
    # fixed seed, as in the other KMeans calls, so the curve is reproducible
    k_means = cluster.KMeans(n_clusters=k, random_state=42)
    k_means.fit(data)
    SSE.append(k_means.inertia_)

plt.plot(numClusters, SSE)
plt.xlabel('Number of Clusters')
plt.ylabel('SSE')

# CART

In [None]:
data_norm.info()

In [None]:
# Predictors for the tree: every column except the respondent ID, the target
# (X9) and the k-means "label" column added in the clustering section. The
# clusters were fitted on data that included X9, so keeping "label" would leak
# the target into the features. Columns are dropped by name so the selection
# does not depend on column positions.
x5 = data_norm.drop(columns=['ID', 'X9', 'label'], errors='ignore')
y5 = data_norm['X9']
x5.info()

In [None]:
x5_train, x5_test, y5_train, y5_test = train_test_split(x5, y5, test_size=0.2, random_state=100)

In [None]:
from sklearn.tree import DecisionTreeRegressor 

In [None]:
# create a regressor object
regressor = DecisionTreeRegressor(random_state = 0) 


In [None]:
# fit the regressor with X and Y data
regressor = regressor.fit(x5_train, y5_train)


In [None]:
y_pred = regressor.predict(x5_test)
y_pred

In [None]:
from sklearn.metrics import mean_squared_error

mse3 = mean_squared_error(y5_test, y_pred)
rmse3 = np.sqrt(mse3)
print("RMSE:",rmse3)
print("MSE:",mse3)
