In [319]:
# Import packages
import numpy as np
import pandas as pd

# Import Scikit-learn (for Data Pre-Processing and Linear Regression)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Import StatsModels (for Regression Evaluation)
import statsmodels.api as sm

# Ignore user warnings
import warnings
warnings.filterwarnings ("ignore", category = UserWarning)


In [320]:
# Import clean data
df = pd.read_csv("/Users/evolkin/Library/CloudStorage/GoogleDrive-evolkin@u.rochester.edu/My Drive/Semester 8/CIS 242: Predictive Analytics and Machine Learning/CIS 242 Team Project/shark_tank_data_clean.csv")

In [321]:
# Inspect dataframe by printing the head
df.head()

Unnamed: 0.1,Unnamed: 0,deal_True,category,askedfor,exchangeforstake,valuation,title,multEntr_True
0,0,0,Novelties,1000000,15,6666667,Ionic Ear,0
1,1,1,Specialty Food,460000,10,4600000,Mr. Tod's Pie Factory,0
2,2,1,Baby and Child Care,50000,15,333333,Ava the Elephant,0
3,3,0,Consumer Services,250000,25,1000000,College Foxes Packing Boxes,0
4,4,0,Consumer Services,1200000,10,12000000,Wispots,0


Make a logistic regression model that calculates the likelihood of a deal being reached given each industry

In [322]:
# Group by whether a deal was struck
df_grouped = df.groupby ("deal_True")

In [323]:
# Output the mean values of the grouped data.
df_grouped.mean ()

Unnamed: 0_level_0,Unnamed: 0,askedfor,exchangeforstake,valuation,multEntr_True
deal_True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,236.254098,289319.672131,18.405738,2393574.0,0.303279
1,257.446215,228521.912351,16.697211,1944013.0,0.346614


In [324]:
# Output the median values of the grouped data.
df_grouped.median ()

Unnamed: 0_level_0,Unnamed: 0,askedfor,exchangeforstake,valuation,multEntr_True
deal_True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,226.5,150000.0,20.0,966666.5,0.0
1,261.0,125000.0,15.0,1000000.0,0.0


In [325]:
## Convert categorical industry data into dummies

# See values in the education variable
df.category.unique()

# Store the dummy column df in a variable titled  "dummies_edu"
dummies_ind = pd.get_dummies (df ["category"], prefix = "ind") 

# Remove the slash in variable name
dummies_ind.columns = [c.replace('/', '_') for c in dummies_ind.columns]

In [326]:
# Output the head of the dummies
dummies_ind.head ()

Unnamed: 0,ind_Alcoholic Beverages,ind_Automotive,ind_Baby and Child Care,ind_Baby and Children's Apparel and Accessories,ind_Baby and Children's Bedding,ind_Baby and Children's Entertainment,ind_Baby and Children's Food,ind_Consumer Services,ind_Costumes,ind_Cycling,...,ind_Specialty Food,ind_Storage and Cleaning Products,ind_Toys and Games,ind_Undergarments and Basics,ind_Water Bottles,ind_Weddings,ind_Wine Accessories,ind_Women's Accessories,ind_Women's Apparel,ind_Women's Shoes
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [327]:
# Remove the original industry variable and other unnecessary columns
featuresToRemove = []
featuresToRemove.append ("category")
featuresToRemove.append ("title")
#featuresToRemove.append("askedfor")
#featuresToRemove.append("exchangeforstake")
#featuresToRemove.append("valuation")
featuresToRemove.append("Unnamed: 0")
#featuresToRemove.append("multEntr_True")

for featureToRemove in featuresToRemove:
    del df [ featureToRemove ]

In [328]:
# Move the target variable deal_True to the front

# Assign target variable
column_to_move = df.pop("deal_True")

# Insert column with insert to move the variable
df.insert(0, "deal_True", column_to_move)

# Output head
df.head()

Unnamed: 0,deal_True,askedfor,exchangeforstake,valuation,multEntr_True
0,0,1000000,15,6666667,0
1,1,460000,10,4600000,0
2,1,50000,15,333333,0
3,0,250000,25,1000000,0
4,0,1200000,10,12000000,0


In [329]:
# Join the dummies back to the dataframe
df = df.join (dummies_ind)

# Output head
df.head()

Unnamed: 0,deal_True,askedfor,exchangeforstake,valuation,multEntr_True,ind_Alcoholic Beverages,ind_Automotive,ind_Baby and Child Care,ind_Baby and Children's Apparel and Accessories,ind_Baby and Children's Bedding,...,ind_Specialty Food,ind_Storage and Cleaning Products,ind_Toys and Games,ind_Undergarments and Basics,ind_Water Bottles,ind_Weddings,ind_Wine Accessories,ind_Women's Accessories,ind_Women's Apparel,ind_Women's Shoes
0,0,1000000,15,6666667,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,460000,10,4600000,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1,50000,15,333333,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,250000,25,1000000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1200000,10,12000000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [330]:
# Remove the last column from X
X = df.iloc[:, 1:14]

In [331]:
# Add intercept value for all rows
df ["intercept"] = 1.0

In [332]:
# Remove low variance columns to prevent singular matrix error
from sklearn.feature_selection import VarianceThreshold

def variance_threshold_selector(data, threshold=0.5):
    # https://stackoverflow.com/a/39813304/1956309
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    return data[data.columns[selector.get_support(indices=True)]]

# min_variance = .9 * (1 - .9)  # You can play here with different values.
min_variance = 0.0001
low_variance = variance_threshold_selector(df, min_variance) 
print('columns removed:')
df.columns ^ low_variance.columns
df.shape
df.shape
X = low_variance

columns removed:


  df.columns ^ low_variance.columns


In [333]:
## Set the y variable
# Create a DataFrame called y with all rows and just the target column.
# Reshape y into a 1-dimensional NumPy array object and set the y variable
y = df.iloc [:, :1].values.reshape (-1, )

# Inspect y
print ("y:", y.shape)
print (type (y))

y: (495,)
<class 'numpy.ndarray'>


In [334]:
## Set the x variable
# Create a df called X with all rows and all columns that follow the target.
#X = df.iloc [:, 1:]

# Inspect X
print ("X Shape:", X.shape)
print (type (X))

X Shape: (495, 59)
<class 'pandas.core.frame.DataFrame'>


In [335]:
# Split into training (70%) and testing (30%) data
X_train, X_test, y_train, y_test = train_test_split (X, y, test_size = 0.3, random_state = 10, shuffle = True)

In [336]:
# Create an instance of the StatsModels Logistic Regression Classifier class, passing in y and X
logit = sm.Logit (y, X)

In [337]:
# Fit the data with Logit's fit() method
result = logit.fit (method = 'bfgs')

         Current function value: 0.693147
         Iterations: 0
         Function evaluations: 15
         Gradient evaluations: 3


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


In [338]:
# Output the results summary
result.summary ()

0,1,2,3
Dep. Variable:,y,No. Observations:,495.0
Model:,Logit,Df Residuals:,436.0
Method:,MLE,Df Model:,58.0
Date:,"Sat, 15 Apr 2023",Pseudo R-squ.:,-0.0001443
Time:,19:26:31,Log-Likelihood:,-343.11
converged:,False,LL-Null:,-343.06
Covariance Type:,nonrobust,LLR p-value:,1.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
deal_True,0,0.195,0,1.000,-0.382,0.382
askedfor,0,3.5e-07,0,1.000,-6.85e-07,6.85e-07
exchangeforstake,0,0.011,0,1.000,-0.022,0.022
valuation,0,4.61e-08,0,1.000,-9.04e-08,9.04e-08
multEntr_True,0,0.207,0,1.000,-0.406,0.406
ind_Alcoholic Beverages,0,0.971,0,1.000,-1.902,1.902
ind_Automotive,0,0.703,0,1.000,-1.377,1.377
ind_Baby and Child Care,0,0.489,0,1.000,-0.958,0.958
ind_Baby and Children's Apparel and Accessories,0,0.848,0,1.000,-1.662,1.662


In [339]:
# Take exponent of the results
print (np.exp (result.params))

deal_True                                          1.0
askedfor                                           1.0
exchangeforstake                                   1.0
valuation                                          1.0
multEntr_True                                      1.0
ind_Alcoholic Beverages                            1.0
ind_Automotive                                     1.0
ind_Baby and Child Care                            1.0
ind_Baby and Children's Apparel and Accessories    1.0
ind_Baby and Children's Bedding                    1.0
ind_Baby and Children's Entertainment              1.0
ind_Baby and Children's Food                       1.0
ind_Consumer Services                              1.0
ind_Costumes                                       1.0
ind_Cycling                                        1.0
ind_Education                                      1.0
ind_Electronics                                    1.0
ind_Entertainment                                  1.0
ind_Fashio

In [340]:
# Import the Scikit-learn packages for Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Create an instance of the Scikit-learn's LogisticRegression class
logisticRegression = LogisticRegression (C = 1e9, max_iter = 200)

# Remove the intercept column
X = X.iloc [:, :-1]

In [341]:
# Create a 70/30 training/testing data split
X_train, X_test, y_train, y_test = train_test_split (X, y, test_size = 0.3, random_state = 10, shuffle = True)

# Fit the model to the training data
logisticRegression.fit (X_train, y_train)

# Output the y-intercept and regression coefficients
print ("Intercept: ", logisticRegression.intercept_)
print ("Coefficients: ", logisticRegression.coef_)

Intercept:  [5.30596614e-13]
Coefficients:  [[ 2.51482431e-11 -3.69370801e-07 -6.71232674e-11  2.83564213e-08
   2.03708049e-12  3.07734342e-13  2.18844959e-13  5.91929221e-13
   2.92791012e-13  0.00000000e+00  9.33479002e-15 -1.42415263e-13
  -7.06377246e-13  1.45907668e-13  4.42324730e-13  3.14164829e-13
  -1.01536467e-13 -2.03697721e-13 -2.77905223e-13  2.95404832e-13
  -1.26054613e-13 -2.72061642e-13  2.94703048e-13  2.92688362e-13
  -1.40910493e-13  2.90004049e-13  7.24796561e-13 -2.76548850e-13
   5.79488714e-13 -1.42415263e-13 -2.82261258e-13  1.60149497e-13
  -1.43873900e-13 -2.86335074e-13  1.48136309e-13 -4.27254549e-13
  -1.38745255e-13  1.44120519e-13  4.23374677e-15 -1.40135216e-13
  -8.10857289e-13 -2.48244277e-13 -7.64383479e-14 -1.38145143e-13
  -2.65992393e-13 -1.41770183e-13  1.17995275e-14  1.52523424e-13
  -7.04419311e-13  1.37669469e-12  1.02170876e-12 -2.70355085e-13
  -1.41099969e-13 -1.43082977e-13 -1.31164889e-13 -2.85678001e-13
  -2.77564215e-13  1.51696899e-1

In [342]:
# Make cross tab for the training data
count_train = pd.crosstab (index = y_train, columns = "Percent")

print ("Training Target Variable:")
print ()
print ((count_train / count_train.sum())*100)
print ()

# Make cross tab for the testing data
count_test = pd.crosstab (index = y_test, columns = "Percent")

print ("Testing Target Variable:")
print ()
print ((count_test / count_test.sum ())*100)

Training Target Variable:

col_0    Percent
row_0           
0      50.289017
1      49.710983

Testing Target Variable:

col_0    Percent
row_0           
0      46.979866
1      53.020134


In [343]:
## Evaluate the accuracy of the model
# Use the predict() method on X_test and store the results in y_test_predictions
y_test_predictions = logisticRegression.predict (X_test)

# Use Scikit-learn's accuracy_score() function to evaluate y_test against y_test_predictions.
print("Accuracy Score: " + f"{accuracy_score (y_test, y_test_predictions):.2f}")

Accuracy Score: 0.48


Given that the accuracy score is 0.48, this means that the model predicts correctly 48% of the time. THis indicates the model does not fit the data very well. This score could perhaps be improved by being fed more data or better data, with more variables that could help explain whether a deal is struck.

In [344]:
# Compare the actual labels to the predicted values
data = { "Actual": y_test, "Predicted": y_test_predictions }
dfCompare = pd.DataFrame (data)

In [345]:
# Output the head for the comparison data
dfCompare.head(10)

Unnamed: 0,Actual,Predicted
0,1,0
1,1,0
2,1,0
3,0,0
4,1,0
5,0,0
6,0,0
7,1,0
8,0,0
9,0,0


In [346]:
# Make and output confusion matrix as a crosstab
pd.crosstab (dfCompare.Predicted, dfCompare.Actual, normalize = "all")

Actual,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.422819,0.47651
1,0.04698,0.053691


The model is extremely pessimistic. It predictes that deal won't be struck 89.93% of the time (0.422819 + 0.476510). It only predicts a deal will be struck 10.07% of the time (0.046980 + 0.053691). Thus, we could guess that a deal won't be struck and would be correct about 90% of the time. It is clear that overall, it is quite unlikely a deal would be struck, as that only happens in the data about 1 in 10 times.