In [57]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression

In [62]:
# Read the Hep C dataset from the CSV file located next to the notebook
df = pd.read_csv('HepC.csv')


In [63]:
# Preview the top five rows to sanity-check the load
df.head(n=5)


Unnamed: 0,Category,Age,Sex,ALB,ALT,AST,BIL,CHE,CREA,GGT,PROT
0,0=Blood Donor,32,m,38.5,7.7,22.1,7.5,6.93,106.0,12.1,69.0
1,0=Blood Donor,32,m,38.5,18.0,24.7,3.9,11.17,74.0,15.6,76.5
2,0=Blood Donor,32,m,46.9,36.2,52.6,6.1,8.84,86.0,33.2,79.3
3,0=Blood Donor,32,m,43.2,30.6,22.6,18.9,7.33,80.0,33.8,75.7
4,0=Blood Donor,32,m,39.2,32.6,24.8,9.6,9.15,76.0,29.9,68.7


In [64]:
# Summarize the frame's structure: index range, column names, non-null counts, dtypes and memory usage
df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 612 entries, 0 to 611
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Category  612 non-null    object 
 1   Age       612 non-null    int64  
 2   Sex       612 non-null    object 
 3   ALB       612 non-null    float64
 4   ALT       612 non-null    float64
 5   AST       612 non-null    float64
 6   BIL       612 non-null    float64
 7   CHE       612 non-null    float64
 8   CREA      612 non-null    float64
 9   GGT       612 non-null    float64
 10  PROT      612 non-null    float64
dtypes: float64(8), int64(1), object(2)
memory usage: 52.7+ KB


In [23]:
# Count missing values per column (missing data was already addressed in the EDA portion of the project)
df.isna().sum()

Category    0
Age         0
Sex         0
ALB         0
ALT         0
AST         0
BIL         0
CHE         0
CREA        0
GGT         0
PROT        0
dtype: int64

In [25]:
# Inspect the distinct labels in the object columns that will be turned into dummy variables.
# 'Category' records whether a person is Hep C - or +; if + they are categorized into
# grades of liver damage 1-3.
for column_name in ('Category', 'Sex'):
    print(df[column_name].unique())

['0=Blood Donor' '0s=suspect Blood Donor' '1=Hepatitis' '2=Fibrosis'
 '3=Cirrhosis']
['m' 'f']


In [65]:
# Add a binary 'Status' flag: 0 for hep c (-) (both blood-donor labels), 1 for hep c (+) (every other label)
df['Status'] = (
    ~df['Category'].isin(['0=Blood Donor', '0s=suspect Blood Donor'])
).astype('int64')


In [66]:
# Verification that the Status column was added
df.head(n=5)

Unnamed: 0,Category,Age,Sex,ALB,ALT,AST,BIL,CHE,CREA,GGT,PROT,Status
0,0=Blood Donor,32,m,38.5,7.7,22.1,7.5,6.93,106.0,12.1,69.0,0
1,0=Blood Donor,32,m,38.5,18.0,24.7,3.9,11.17,74.0,15.6,76.5,0
2,0=Blood Donor,32,m,46.9,36.2,52.6,6.1,8.84,86.0,33.2,79.3,0
3,0=Blood Donor,32,m,43.2,30.6,22.6,18.9,7.33,80.0,33.8,75.7,0
4,0=Blood Donor,32,m,39.2,32.6,24.8,9.6,9.15,76.0,29.9,68.7,0


In [67]:
# One-hot encode the categorical columns: every level of 'Category' and 'Sex' becomes its own indicator column
df_dummies = pd.get_dummies(data=df, columns=['Category', 'Sex'])


In [68]:
# Preview the top five rows of the dummy-encoded dataframe
df_dummies.head(n=5)

Unnamed: 0,Age,ALB,ALT,AST,BIL,CHE,CREA,GGT,PROT,Status,Category_0=Blood Donor,Category_0s=suspect Blood Donor,Category_1=Hepatitis,Category_2=Fibrosis,Category_3=Cirrhosis,Sex_f,Sex_m
0,32,38.5,7.7,22.1,7.5,6.93,106.0,12.1,69.0,0,True,False,False,False,False,False,True
1,32,38.5,18.0,24.7,3.9,11.17,74.0,15.6,76.5,0,True,False,False,False,False,False,True
2,32,46.9,36.2,52.6,6.1,8.84,86.0,33.2,79.3,0,True,False,False,False,False,False,True
3,32,43.2,30.6,22.6,18.9,7.33,80.0,33.8,75.7,0,True,False,False,False,False,False,True
4,32,39.2,32.6,24.8,9.6,9.15,76.0,29.9,68.7,0,True,False,False,False,False,False,True


### Splitting data - target variable "Category"

In [69]:
# Separate features and target.
# The target is the one-hot encoded 'Category' label (one indicator column per class).
target_cols = [
    'Category_0=Blood Donor',
    'Category_0s=suspect Blood Donor',
    'Category_1=Hepatitis',
    'Category_2=Fibrosis',
    'Category_3=Cirrhosis',
]

# 'Status' is computed directly from 'Category' (0 for the two blood-donor labels, 1 otherwise),
# so keeping it among the features would leak the target into the model and inflate every score.
# Drop it together with the target columns.
X = df_dummies.drop(columns=target_cols + ['Status'])
y = df_dummies[target_cols]

# Guard against target leakage: no target-derived column may remain in the feature matrix
assert not ({'Status', *target_cols} & set(X.columns)), "target-derived column left in X"

# Split data into training and testing sets (80% / 20%, fixed seed so the split is reproducible)
# NOTE(review): the classes look heavily imbalanced - consider passing `stratify=` once the
# per-class counts are confirmed to be large enough for a stratified split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

