# **FINAL PROJECT REPORT** - **GROUP 8**
# SUBJECT: DATA MINING
# *TOPIC: PREDICTING CANCER MORTALITY IN U.S. COUNTIES*
# List of members:
1. Duong Van Nhat Long - 20521561
2. Vo Đoan To Loan - 20521544
3. Nguyen Thanh Luan - 20521582
4. Dinh Thi Tu Uyen - 20522139

### About dataset: These data were aggregated from a number of sources, including the American Community Survey (census.gov), clinicaltrials.gov, and cancer.gov. The dataset has 34 features and 3047 samples.
### Data Source: https://data.world/nrippner/ols-regression-challenge

## Import necessary libraries

In [5]:
# Import libraries for data manipulation and numerics (pandas, numpy) and for plotting and data visualization (seaborn, matplotlib).
%matplotlib inline
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# To use machine learning algorithms, we import the sklearn library
from sklearn.model_selection import train_test_split

from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics


from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

from sklearn.decomposition import PCA

from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

import seaborn as sns



## Import dataset "cancer_reg.csv"

In [6]:

# Load the county-level dataset; the file is decoded as Latin-1.
data = pd.read_csv(filepath_or_buffer="cancer_reg.csv", encoding="latin-1")

In [7]:
# Preview the first rows to sanity-check the load (head() defaults to 5 rows).
data.head()

In [8]:
# Report the dataset dimensions: each column is a feature, each row a sample.
n_samples, n_features = data.shape
print("Number of features:", n_features)
print("Number of samples :", n_samples)

## Survey dataset

This dataset has 34 features and 3047 samples.

In [9]:
# View basic information: per-column non-null counts and dtypes, plus the
# frame's memory usage.
data.info()

In [10]:
# Descriptive statistics for the quantitative attributes: number of values,
# mean, standard deviation, minimum, quartiles and maximum.
data.describe()

In [11]:
# Build a one-column table ('types') listing the data type of every feature,
# then display it.
dtype = data.dtypes.to_frame(name='types')
dtype

In [12]:
# Count how many features share each data type.
dtype.value_counts()

We see there are 29 features of type float64, 3 features of type int64, 2 features of type object

In [13]:
# Check whether the dataset has missing values: count the NaN entries in each
# column.
data.isna().sum()

We see that 3 features have missing values: PctSomeCol18_24, PctEmployed16_Over, and PctPrivateCoverageAlone.

In [14]:
# Flag every row that contains at least one NaN, then report how many such
# rows there are and what percentage of all rows they represent.
incomplete_rows = data.isna().any(axis=1)
missing_samples = incomplete_rows.sum()
percent_missing = 100 * (missing_samples / len(data))

print('Number missing samples:', missing_samples)
print('Percent missing samples:', percent_missing)


We see that 2456 samples (about 80.6%) contain at least one missing value.

In [15]:
# Missing-value counts per feature, computed once and reused below.
na_counts = data.isna().sum()

# Per feature: number of missing values and their share of that feature's
# rows, ordered from the largest share to the smallest.
missing_data = pd.DataFrame({
    'Number missing': na_counts,
    'Percent %': na_counts * 100 / len(data),
}).sort_values(by='Percent %', ascending=False)

# Same counts, but expressed as a share of every cell in the dataset
# (data.size = rows x columns), so the column sums give the overall totals.
missing_data_total = pd.DataFrame({
    'Number missing': na_counts,
    'Percent %': na_counts * 100 / data.size,
})

print(missing_data_total.sum())
print(missing_data.head(3))

We see that there are 3046 missing values, which account for about 2.94% of all values in the dataset:
+ PctSomeCol18_24 has 2285 missing values (about 75% of its samples)
+ PctPrivateCoverageAlone has 609 missing values (about 19% of its samples)
+ PctEmployed16_Over has 152 missing values (about 5% of its samples)

In [16]:
# Bar chart of the first three rows of missing_data, i.e. the features with
# the highest percentage of missing values.
top_missing = missing_data.head(3)

plt.figure(figsize=(10, 10))
sns.barplot(x=top_missing.index, y='Percent %', data=top_missing)

plt.title('Percent Missing')
plt.xlabel('Features')
plt.ylabel('Percent')
plt.xticks(rotation='horizontal')

plt.show()

We see that PctSomeCol18_24 has quite a large data loss

## DATA PREPROCESSING

Because PctSomeCol18_24 is missing too much of its data (75%), we remove it before fitting any model.

In [17]:
# Remove the PctSomeCol18_24 feature from the working frame.
data = data.drop(columns=['PctSomeCol18_24'])

In [18]:
# Show the current (rows, columns) dimensions of the dataset.
data.shape

### FEATURE ENGINEERING

#### *Feature Creation*

In [19]:
# Extract County and State from Geography.
#
# Each value is split at its first comma in a single vectorised pass rather
# than by indexing the frame row by row. Stripping whitespace afterwards means
# the result does not depend on exactly one space following the comma, and a
# value without any comma keeps its full text as the county (its state is '').
geo_parts = data['Geography'].str.partition(',')

data['County'] = geo_parts[0].str.strip()
data['State'] = geo_parts[2].str.strip()

# Kept as plain lists in case another cell still refers to these names.
County = data['County'].tolist()
State = data['State'].tolist()

In [20]:
# View columns after splitting
# Compare the original Geography text with the derived County and State
# columns for the first 10 rows.
data[['Geography','County','State']].head(10)

##### We need to encode ['County', 'State'] before giving them to a sklearn model. However, 'County' has high cardinality, so we do not encode it.

In [21]:
# Encode 'State' by its mean 'TARGET_deathRate'.
# Build the lookup {state: mean death rate} used for the mapping, then show it.
# NOTE(review): the means are taken over every row of `data`, so the encoding
# is derived from the target of all samples - confirm this is intended if a
# held-out test set is evaluated with this feature.
dic_map = data.groupby('State')['TARGET_deathRate'].mean().to_dict()
dic_map

In [22]:
# Look up each row's state in dic_map and store the result as a new numeric
# column, then display a random sample of 10 rows to check the mapping.
state_death_rate = data['State'].map(dic_map)
data['State_encode'] = state_death_rate
data[['State', 'State_encode']].sample(n=10)

#### *Missing Value Handling*
##### After dropping PctSomeCol18_24, two features still have missing data: PctPrivateCoverageAlone and PctEmployed16_Over. Here we consider how to handle them.

In [23]:
# Keep an independent (deep) copy of the dataset to evaluate on before
# settling on the best way to handle the missing values.
data_copy = data.copy(deep=True)

In [24]:
# Compare ways of filling the two incomplete features. Each candidate is
# scored by the R^2 of a LinearRegression fitted on just those two columns,
# using the same 85/15 split for every candidate so the scores are comparable.
#
# Candidates, in the order their scores are appended to `lst`:
#   SimpleImputer (mean, median, most_frequent, constant), mean by State,
#   KNNImputer.
#
# NOTE(review): every imputer is fitted on all rows before the split, so the
# held-out rows influence the fill values; acceptable for a quick comparison,
# but worth confirming before relying on the absolute scores.
impute_cols = ['PctPrivateCoverageAlone', 'PctEmployed16_Over']
y = data['TARGET_deathRate']

candidates = {}

# Column-wise SimpleImputer strategies.
for i in ['mean', 'median', 'most_frequent', 'constant']:
    imputer_ = SimpleImputer(missing_values=np.nan, strategy=i)
    candidates[i] = pd.DataFrame(
        imputer_.fit_transform(data[impute_cols]),
        columns=impute_cols,
        index=data.index,
    )

# Mean by State: fill each gap with the mean of the row's state. A gap whose
# state has no observed value at all falls back to the overall column mean,
# because LinearRegression rejects inputs that still contain NaN.
X_t = data.copy()
for col in impute_cols:
    state_mean = X_t.groupby('State')[col].transform('mean')
    X_t[col] = X_t[col].fillna(state_mean).fillna(X_t[col].mean())
candidates['mean by State'] = X_t[impute_cols]

# KNN imputation from the 2 nearest rows, weighted by distance.
imputer = KNNImputer(n_neighbors=2, weights="distance")
X = pd.DataFrame(
    imputer.fit_transform(data[impute_cols]),
    columns=impute_cols,
    index=data.index,
)
candidates['KNNImputer'] = X

lst = []
sc = 0
for label, features in candidates.items():
    # A fixed random_state gives every candidate the identical train/test rows.
    X_train, X_test, y_train, y_test = train_test_split(
        features, y, train_size=0.85, random_state=42)

    ln = LinearRegression()
    ln.fit(X_train, y_train)

    score = ln.score(X_test, y_test)
    lst.append(score)
    if label == 'mean by State':
        sc = score

    print('Score with ' + label + ' data:', score)


In [None]:
# Lookup {state: mean PctPrivateCoverageAlone} used to fill that feature's
# missing values; displayed for inspection.
dic_map = data_copy.groupby('State')['PctPrivateCoverageAlone'].mean().to_dict()
dic_map

In [None]:
# Fill each missing PctPrivateCoverageAlone with the value dic_map holds for
# the row's state; existing values are left untouched.
state_fill = data_copy['State'].map(dic_map)
data_copy['PctPrivateCoverageAlone'] = data_copy['PctPrivateCoverageAlone'].fillna(state_fill)

In [None]:
# Lookup {state: mean PctEmployed16_Over} used to fill that feature's missing
# values; displayed for inspection.
dic_map = data_copy.groupby('State')['PctEmployed16_Over'].mean().to_dict()
dic_map

In [None]:
# Fill each missing PctEmployed16_Over with the value dic_map holds for the
# row's state; existing values are left untouched.
state_fill = data_copy['State'].map(dic_map)
data_copy['PctEmployed16_Over'] = data_copy['PctEmployed16_Over'].fillna(state_fill)

In [None]:
# Recompute the missing-value summary on data_copy.
# Missing-value counts per feature, computed once and reused below.
na_counts = data_copy.isna().sum()

# Per feature: number of missing values and their share of that feature's
# rows, ordered from the largest share to the smallest.
missing_data = pd.DataFrame({
    'Number missing': na_counts,
    'Percent %': na_counts * 100 / len(data_copy),
}).sort_values(by='Percent %', ascending=False)

# Same counts as a share of every cell in data_copy (rows x columns), so the
# column sums give the overall totals.
missing_data_total = pd.DataFrame({
    'Number missing': na_counts,
    'Percent %': na_counts * 100 / data_copy.size,
})

print(missing_data_total.sum())
print(missing_data.head(2))