# Assignment 3 for Data Analysis 3

Conducted by Ian Brandenburg

[GitHub Repo](https://github.com/Iandrewburg/DA3_Brandenburg/tree/main/Assignment_3)



Developing a predictive model to identify which small or medium-sized firms in the "Manufacture of computer, electronic, and optical products" industry might fail in 2015, based on their activity in 2014.

[**Data source**](https://osf.io/b2ft9/?view_only=):
Detailed company data from a middle-sized country in the European Union
All registered companies in 2005-2016 in three selected industries 
  (auto manufacturing, equipment manufacturing, hotels and restaurants)
This rich database was constructed from multiple publicly available sources 
by Bisnode, a business data and analytics company (www.bisnode.com),
for educational purposes

In [1]:
import warnings
# Install a blanket 'ignore' filter: no category or module is given, so every
# warning raised after this point is suppressed.
# NOTE(review): this also hides deprecation and chained-assignment warnings from
# pandas / scikit-learn; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import math
import os
from pathlib import Path
import sys
from plotnine import *
from mizani.formatters import percent_format
import ast
from patsy import dmatrices
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import time

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.inspection import PartialDependenceDisplay
from sklearn.inspection import partial_dependence
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

# ***1. Data Processing***

## Import Data Directly from GitHub

In [7]:
# DATA IMPORT - FROM GITHUB
# Read the panel CSV straight from the assignment repository (needs network
# access), then preview the first rows of the loaded frame.
DATA_URL = 'https://github.com/Iandrewburg/DA3_Brandenburg/raw/main/Assignment_3/cs_bisnode_panel.csv'
data = pd.read_csv(DATA_URL)
data.head()

Unnamed: 0,comp_id,begin,end,COGS,amort,curr_assets,curr_liab,extra_exp,extra_inc,extra_profit_loss,...,gender,origin,nace_main,ind2,ind,urban_m,region_m,founded_date,exit_date,labor_avg
0,1001034.0,2005-01-01,2005-12-31,,692.59259,7266.666504,7574.074219,0.0,0.0,0.0,...,mix,Domestic,5630.0,56.0,3.0,1,Central,1990-11-19,,
1,1001034.0,2006-01-01,2006-12-31,,603.703674,13122.222656,12211.111328,0.0,0.0,0.0,...,mix,Domestic,5630.0,56.0,3.0,1,Central,1990-11-19,,
2,1001034.0,2007-01-01,2007-12-31,,425.925934,8196.295898,7800.0,0.0,0.0,0.0,...,mix,Domestic,5630.0,56.0,3.0,1,Central,1990-11-19,,
3,1001034.0,2008-01-01,2008-12-31,,300.0,8485.185547,7781.481445,0.0,0.0,0.0,...,mix,Domestic,5630.0,56.0,3.0,1,Central,1990-11-19,,
4,1001034.0,2009-01-01,2009-12-31,,207.40741,5137.037109,15300.0,0.0,0.0,0.0,...,mix,Domestic,5630.0,56.0,3.0,1,Central,1990-11-19,,0.083333


In [8]:
# Show the column index of the loaded panel to see which variables are available.
data.columns

Index(['comp_id', 'begin', 'end', 'COGS', 'amort', 'curr_assets', 'curr_liab',
       'extra_exp', 'extra_inc', 'extra_profit_loss', 'finished_prod',
       'fixed_assets', 'inc_bef_tax', 'intang_assets', 'inventories',
       'liq_assets', 'material_exp', 'net_dom_sales', 'net_exp_sales',
       'personnel_exp', 'profit_loss_year', 'sales', 'share_eq',
       'subscribed_cap', 'tang_assets', 'wages', 'D', 'balsheet_flag',
       'balsheet_length', 'balsheet_notfullyear', 'year', 'founded_year',
       'exit_year', 'ceo_count', 'foreign', 'female', 'birth_year',
       'inoffice_days', 'gender', 'origin', 'nace_main', 'ind2', 'ind',
       'urban_m', 'region_m', 'founded_date', 'exit_date', 'labor_avg'],
      dtype='object')

## Hold-Out Sample

In [9]:
# Build the hold-out sample from the raw panel. A row is kept when
#   - its 2-digit industry code `ind2` equals 26 (presumably the industry named
#     in the introduction -- confirm against the data dictionary),
#   - its `sales` lie between 1,000 and 10,000,000, both bounds included,
#   - it belongs to the year 2014.
HOLDOUT_IND2 = 26
HOLDOUT_YEAR = 2014
HOLDOUT_SALES_MIN = 1000
HOLDOUT_SALES_MAX = 10000000

hold_out_mask = (
    (data['ind2'] == HOLDOUT_IND2)
    # between() is inclusive on both ends by default, matching `>=` / `<=`;
    # rows with missing sales evaluate to False and are dropped either way.
    & data['sales'].between(HOLDOUT_SALES_MIN, HOLDOUT_SALES_MAX)
    & (data['year'] == HOLDOUT_YEAR)
)

# .copy() gives the hold-out its own independent frame, so later column
# assignments on it are unambiguous and never depend on `data`. This matters
# because warnings are globally silenced in the imports cell, which would hide
# pandas' chained-assignment warning.
hold_out_sample = data.loc[hold_out_mask].copy()
hold_out_sample

Unnamed: 0,comp_id,begin,end,COGS,amort,curr_assets,curr_liab,extra_exp,extra_inc,extra_profit_loss,...,gender,origin,nace_main,ind2,ind,urban_m,region_m,founded_date,exit_date,labor_avg
969,6.538183e+06,2014-01-01,2014-12-31,,1018.518494,17022.222656,3040.740723,0.0,0.0,0.0,...,male,Domestic,2630.0,26.0,2.0,2,East,1992-08-25,,
1128,8.416055e+06,2014-01-01,2014-12-31,,425.925934,105740.742188,6918.518555,0.0,0.0,0.0,...,female,Domestic,2651.0,26.0,2.0,1,Central,1995-08-28,,0.083333
1467,1.242838e+07,2014-01-01,2014-12-31,,0.000000,0.000000,9277.777344,0.0,0.0,0.0,...,male,Domestic,2660.0,26.0,2.0,2,Central,1991-06-27,,0.083333
1706,1.777654e+07,2014-01-01,2014-12-31,,48.148148,142296.296875,164237.031250,0.0,0.0,0.0,...,mix,Domestic,2620.0,26.0,2.0,1,Central,1997-07-01,,0.305556
1735,1.862676e+07,2014-01-01,2014-12-31,,5755.555664,95185.187500,45766.667969,0.0,0.0,0.0,...,male,Domestic,2670.0,26.0,2.0,2,West,2010-11-15,,0.159091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
286323,4.593723e+11,2014-01-01,2014-12-31,,0.000000,40740.742188,559.259277,0.0,0.0,0.0,...,male,Domestic,2620.0,26.0,2.0,2,East,1993-08-06,,0.083333
286890,4.610112e+11,2014-01-01,2014-12-31,,429.629639,58781.480469,2651.851807,0.0,0.0,0.0,...,male,Domestic,2620.0,26.0,2.0,3,Central,1997-12-10,,0.166667
287204,4.620594e+11,2014-01-01,2014-12-31,,1111.111084,68274.070312,16211.111328,0.0,0.0,0.0,...,male,Domestic,2630.0,26.0,2.0,1,Central,1996-12-30,,0.250000
287272,4.623300e+11,2014-01-01,2014-12-31,,0.000000,17192.591797,7325.925781,0.0,0.0,0.0,...,male,mix,2611.0,26.0,2.0,2,East,2010-12-15,,0.166667


In [10]:
# Count the missing (NaN) entries in each column of the hold-out sample
missing_values_count = hold_out_sample.isna().sum()

# Present the per-column counts as a one-column table for display
missing_values_table = missing_values_count.to_frame(name='Missing Values')
missing_values_table

Unnamed: 0,Missing Values
comp_id,0
begin,0
end,0
COGS,948
amort,2
curr_assets,0
curr_liab,0
extra_exp,0
extra_inc,0
extra_profit_loss,0
