<a href="https://colab.research.google.com/github/Jrk373/MachineLearningDemo/blob/main/DecisionTreeClasifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Classification Tree from start to finish

In [172]:
import pandas as pd  # to load and manipulate data and for One-Hot Encoding
import numpy as np  # to calculate the mean and standard deviation
import matplotlib.pyplot as plt  # to draw graphs
from sklearn.tree import DecisionTreeClassifier  # to build a classification tree
from sklearn.tree import plot_tree  # to draw a classification tree
from sklearn.model_selection import train_test_split  # to split data into training and testing sets
from sklearn.model_selection import cross_val_score  # for cross validation
from sklearn.metrics import ConfusionMatrixDisplay  # creates and draws a confusion matrix


In [173]:
import urllib.request

# Raw-content URL of the dataset and the local file name it is saved under
url = 'https://raw.githubusercontent.com/Jrk373/MachineLearningDemo/main/watson_healthcare_modified.csv'
file_path = 'watson_healthcare_modified.csv'

# Download the file into the working directory; report success as soon as the
# download itself has finished (independent of whether parsing succeeds)
urllib.request.urlretrieve(url, file_path)
print('Successfully downloaded', file_path)

# Load the dataset
try:
    df = pd.read_csv(file_path)
    print('Data successfully loaded as data frame "df"')
except pd.errors.ParserError as e:
    print("ParserError encountered:", e)
    print("Attempting to load with alternative options...")
    # on_bad_lines='skip' drops malformed rows; it replaces the
    # error_bad_lines=False argument, which was removed in pandas 2.0
    df = pd.read_csv(file_path, delimiter=',', on_bad_lines='skip')
    print("Data loaded with error handling.")


Successfully downloaded watson_healthcare_modified.csv
Data successfully loaded as data frame "df"


Assessing the shape of data helps identify its dimensionality (rows and columns), which is crucial for understanding its structure and determining suitable analysis techniques. It ensures the dataset is in the expected format, enabling error detection and proper preprocessing. Additionally, knowing the data shape aids in resource optimization and selecting the right tools for analysis.

In [174]:
# Check the shape (rows, columns) of the loaded data frame
print('Data set rows and columns:', df.shape)

Data set rows and columns: (1676, 35)


In [175]:
# Print the first 5 rows as a quick look at the column names and values
print(df.head(5))

   EmployeeID  Age Attrition     BusinessTravel  DailyRate  Department  \
0     1313919   41        No      Travel_Rarely       1102  Cardiology   
1     1200302   49        No  Travel_Frequently        279   Maternity   
2     1060315   37       Yes      Travel_Rarely       1373   Maternity   
3     1272912   33        No  Travel_Frequently       1392   Maternity   
4     1414939   27        No      Travel_Rarely        591   Maternity   

   DistanceFromHome  Education EducationField  EmployeeCount  ...  \
0                 1          2  Life Sciences              1  ...   
1                 8          1  Life Sciences              1  ...   
2                 2          2          Other              1  ...   
3                 3          4  Life Sciences              1  ...   
4                 2          1        Medical              1  ...   

   RelationshipSatisfaction StandardHours  Shift  TotalWorkingYears  \
0                         1            80      0                  8  

## Data Wrangling

### Drop unnecessary columns

In [176]:
# Labels of the columns that are removed before modelling
columns_to_drop = [
    'EmployeeID',
    'StandardHours',
    'Over18',
    'MonthlyRate',
    'EmployeeCount',
]

# Rebind df to the frame returned by drop(), i.e. df without those columns
df = df.drop(labels=columns_to_drop, axis='columns')

In [177]:
# Deal with NA values
## Identify Variable with NaN values
def find_columns_with_nan(df):
    """Return the labels of every column in ``df`` that holds at least one NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    list
        Column labels, in their original column order, whose column contains
        at least one missing value. Empty when nothing is missing.
    """
    # One vectorised pass over the whole frame. Selecting with a positional
    # boolean mask (rather than looking each label up again) also keeps this
    # working when column labels are duplicated.
    has_nan = df.isna().any(axis=0).to_numpy()
    return df.columns[has_nan].tolist()

### Collect the labels of the columns that contain NaN values
columns_with_nan = find_columns_with_nan(df)

# Report the outcome; an empty list means no column has a NaN
if not columns_with_nan:
    print("There are no NaN values in the dataset.")
else:
    print("Columns with NaN values:", columns_with_nan)

There are no NaN values in the dataset.


In [178]:
# Lets get more basic information on columns, datatypes etc using .info().
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
print('Feature Information:')
df.info()

Feature Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1676 entries, 0 to 1675
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1676 non-null   int64 
 1   Attrition                 1676 non-null   object
 2   BusinessTravel            1676 non-null   object
 3   DailyRate                 1676 non-null   int64 
 4   Department                1676 non-null   object
 5   DistanceFromHome          1676 non-null   int64 
 6   Education                 1676 non-null   int64 
 7   EducationField            1676 non-null   object
 8   EnvironmentSatisfaction   1676 non-null   int64 
 9   Gender                    1676 non-null   object
 10  HourlyRate                1676 non-null   int64 
 11  JobInvolvement            1676 non-null   int64 
 12  JobLevel                  1676 non-null   int64 
 13  JobRole                   1676 non-null   object
 14  Job

In [179]:
# Identifying missing values in the dataset: count per column, largest first
missing_values = df.isnull().sum().sort_values(ascending=False)
missing_values = missing_values[missing_values > 0]  # Keep only columns that have missing values

if missing_values.empty:
    print("There are no missing values in the dataset.")
else:
    print("Missing values by column:")
    print(missing_values)

    # Replacing missing values (this part only runs when something is missing).
    # The filled column is assigned back to df: calling fillna(inplace=True) on
    # the df[column] selection is chained assignment, which pandas warns about
    # and which does not update df when Copy-on-Write is enabled.
    for column in df.columns:
        if df[column].isnull().any():  # Check if the column has missing values
            if df[column].dtype == 'object':  # For object (categorical) columns
                mode_value = df[column].mode()[0]
                df[column] = df[column].fillna(mode_value)
                print(f"Missing values in column '{column}' replaced with mode: {mode_value}")
            elif df[column].dtype in ['float64', 'int64']:  # For numeric columns
                mean_value = df[column].mean()
                df[column] = df[column].fillna(mean_value)
                print(f"Missing values in column '{column}' replaced with mean: {mean_value:.2f}")


There are no missing values in the dataset.


In [181]:
# Name of the column the model should predict
Target = 'Attrition'

# Features: an independent copy of every column except the target
X = df.drop(columns=[Target]).copy()

# Labels: an independent copy of the target column
y = df[Target].copy()

# Report the shapes of X and y
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

# Guard: X and y must describe the same number of samples (rows)
if len(X) != len(y):
    raise ValueError("X and y must have the same number of samples.")


Shape of X: (1676, 29)
Shape of y: (1676,)


In [182]:
# Identify categorical columns - assuming they are of type 'object'
categorical_columns = X.select_dtypes(include=['object']).columns

## Print the shape so it can be compared with the shape after encoding
print(f"Shape before encoding: {X.shape}")

# Print unique values for each categorical column before encoding.
# The values are read from X, the same frame the columns were selected from.
print("Unique values in categorical columns before one-hot encoding:")
for column in categorical_columns:
    print(f"{column}: {X[column].unique()}")

Shape before encoding: (1676, 29)
Unique values in categorical columns before one-hot encoding:
BusinessTravel: ['Travel_Rarely' 'Travel_Frequently' 'Non-Travel']
Department: ['Cardiology' 'Maternity' 'Neurology']
EducationField: ['Life Sciences' 'Other' 'Medical' 'Marketing' 'Technical Degree'
 'Human Resources']
Gender: ['Female' 'Male']
JobRole: ['Nurse' 'Other' 'Therapist' 'Administrative' 'Admin']
MaritalStatus: ['Single' 'Married' 'Divorced']
OverTime: ['Yes' 'No']


In [183]:
# Apply one-hot encoding to all categorical variables in X.
# The result is stored under a new name, so X itself is left unchanged here.
df_encoded = pd.get_dummies(X)

##### One-hot encoded data set

In [184]:
## Print the shape to see how many new columns were added
print(f"Shape after encoding: {df_encoded.shape}")

# Identify the columns created by one-hot encoding, i.e. those that do not
# exist in the original data frame
new_columns = [col for col in df_encoded.columns if col not in df.columns]

# Print unique values in the new one-hot encoded columns; each should contain
# only two indicator values (False/True, equivalently 0/1)
print("\nUnique values in one-hot encoded columns:")
for column in new_columns:
    print(f"{column}: {df_encoded[column].unique()}")

Shape after encoding: (1676, 46)

Unique values in one-hot encoded columns:
BusinessTravel_Non-Travel: [False  True]
BusinessTravel_Travel_Frequently: [False  True]
BusinessTravel_Travel_Rarely: [ True False]
Department_Cardiology: [ True False]
Department_Maternity: [False  True]
Department_Neurology: [False  True]
EducationField_Human Resources: [False  True]
EducationField_Life Sciences: [ True False]
EducationField_Marketing: [False  True]
EducationField_Medical: [False  True]
EducationField_Other: [False  True]
EducationField_Technical Degree: [False  True]
Gender_Female: [ True False]
Gender_Male: [False  True]
JobRole_Admin: [False  True]
JobRole_Administrative: [False  True]
JobRole_Nurse: [ True False]
JobRole_Other: [False  True]
JobRole_Therapist: [False  True]
MaritalStatus_Divorced: [False  True]
MaritalStatus_Married: [False  True]
MaritalStatus_Single: [ True False]
OverTime_No: [False  True]
OverTime_Yes: [ True False]


In [185]:
# Tally how many columns of df_encoded have each dtype. Dtypes that do not
# occur are still listed (with a count of 0); keys are visited in sorted order
# so the resulting dict is already alphabetical.
data_type_counts = {
    dtype: (df_encoded.dtypes == dtype).sum()
    for dtype in sorted(["bool", "category", "datetime64[ns]", "float64", "int64", "object", "str"])
}

# Print one line per dtype
print("Data type counts (including 0 values):")
for dtype_name, n_columns in data_type_counts.items():
    print(f"{dtype_name}: {n_columns}")

Data type counts (including 0 values):
bool: 24
category: 0
datetime64[ns]: 0
float64: 0
int64: 22
object: 0
str: 0


In [186]:

# Turn every boolean column of df_encoded into integers
bool_columns = df_encoded.select_dtypes(include=["bool"]).columns
for bool_col in bool_columns:
    df_encoded[bool_col] = df_encoded[bool_col].astype(int)

# Any column that is still of object dtype is label-encoded via category codes
object_columns = df_encoded.select_dtypes(include=["object"]).columns
for object_col in object_columns:
    df_encoded[object_col] = df_encoded[object_col].astype("category").cat.codes

# Tally the columns per dtype again (dtypes that do not occur get a count of 0);
# keys are visited in sorted order so the dict is already alphabetical
data_type_counts = {
    dtype: (df_encoded.dtypes == dtype).sum()
    for dtype in sorted(["bool", "category", "datetime64[ns]", "float64", "int64", "object"])
}

# Print one line per dtype
print("Data type counts (including 0 values):")
for dtype_name, n_columns in data_type_counts.items():
    print(f"{dtype_name}: {n_columns}")


Data type counts (including 0 values):
bool: 0
category: 0
datetime64[ns]: 0
float64: 0
int64: 46
object: 0


In [190]:
# X now refers to the encoded feature frame (the same object as df_encoded)
X = df_encoded

# List the distinct values found in each feature column
for feature in X.columns:
    print(f"{feature}: {X[feature].unique()}")

Age: [41 49 37 33 27 32 59 30 38 36 35 29 31 34 28 22 53 24 21 42 44 46 39 43
 50 26 48 55 45 56 23 51 40 54 58 20 25 19 57 52 47 18 60]
DailyRate: [1102  279 1373 1392  591 1005 1324 1358  216 1299  809  153  670 1346
  103 1389  334 1123 1219  371  673 1218  419  391  699 1282 1125  691
  477  705  924 1459  125  895  813 1273  869  890  852 1141  464 1240
 1357  994  721 1360 1065  408 1211 1229  626 1434 1488 1097 1443  515
  853 1142  655 1115  427  653  989 1435 1223  836 1195 1339  664  318
 1225 1328 1082  548  132  746  776  193  397  945 1214  111  573 1153
 1400  541  432  288  669  530  632 1334  638 1093 1217 1353  120  682
  489  807  827  871  665 1040 1420  240 1280  534 1456  658  142 1127
 1031 1189 1354 1467  922  394 1312  750  441  684  249  841  147  528
  594  470  957  542  802 1355 1150 1329  959 1033 1316  364  438  689
  201 1427  857  933 1181 1395  662 1436  194  967 1496 1169 1145  630
  303 1256  440 1450 1452  465  702 1157  602 1480 1268  713  134  526


In [188]:
# Inspect the distinct labels of the target variable before encoding
print('Unique y values before encoding:')
y.unique()  # bare last expression, so the notebook displays the array

Unique y values before encoding:


array(['No', 'Yes'], dtype=object)

In [189]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Fit the encoder on the target labels, then convert them to integer codes
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

# Show the distinct encoded values
print('Unique y values after encoding:')
unique_values = np.unique(y_encoded)
print(unique_values)

Unique y values after encoding:
[0 1]


In [191]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=373,
)

# Confirm the split: training and testing parts must add up to the full data
if len(X_train) + len(X_test) != len(X) or len(y_train) + len(y_test) != len(y):
    print("Data split unsuccessful. Please check your data!")
else:
    print("Data split successful!")
    print(f"Training set size: {len(X_train)} samples")
    print(f"Testing set size: {len(X_test)} samples")

Data split successful!
Training set size: 1340 samples
Testing set size: 336 samples


In [192]:
from sklearn.tree import DecisionTreeClassifier

# Settings passed explicitly to the tree model
tree_params = {
    # Function that measures the quality of a split:
    # "gini" for Gini Impurity or "entropy" for Information Gain.
    "criterion": "gini",
    # Maximum depth of the tree. None means nodes are expanded until all leaves
    # are pure or contain < min_samples_split samples.
    "max_depth": None,
    # Minimum number of samples required to split an internal node (at least 2).
    "min_samples_split": 2,
    # Controls the randomness of the estimator so results are reproducible.
    "random_state": 373,
    # A node is split only if the split decreases the impurity by at least this
    # value; 0.0 means any split that reduces the impurity is performed.
    "min_impurity_decrease": 0.0,
}

# Parameters not passed here (the classifier's own defaults apply):
#   splitter="best"               strategy used to choose the split at each node ("best" or "random")
#   min_samples_leaf=1            minimum number of samples required at a leaf node
#   min_weight_fraction_leaf=0.0  minimum weighted fraction of the total weights required at a leaf node
#   max_features=None             features considered per split; None means all features
#   max_leaf_nodes=None           maximum number of leaf nodes; None means unlimited
#   class_weight=None             class weights; None weights all classes equally
#   ccp_alpha=0.0                 Minimal Cost-Complexity Pruning parameter; larger values prune more

# Create the tree model
clf = DecisionTreeClassifier(**tree_params)


In [193]:
# Fit (train) the model on the training split.
# The optional fit() arguments sample_weight and check_input are not passed.
clf.fit(X_train, y_train)