In [1]:
# Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random as rnd
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE

In [2]:
# Import file
file = "Resources/Incident_data.txt"
df = pd.read_csv(file, encoding="ISO-8859-1", sep='\t')
display(df)

Unnamed: 0,Notes,Leading Cancer Sites,Leading Cancer Sites Code,States,States Code,Year,Year Code,Race,Race Code,Sex,Sex Code,Count,Population
0,,Breast,26000.0,Alabama,1.0,1999.0,1999.0,Asian or Pacific Islander,A-PI,Female,F,18.0,17708
1,,Breast,26000.0,Alabama,1.0,1999.0,1999.0,Black or African American,2054-5,Female,F,597.0,623475
2,,Breast,26000.0,Alabama,1.0,1999.0,1999.0,White,2106-3,Female,F,2360.0,1640665
3,,Breast,26000.0,Alabama,1.0,1999.0,1999.0,White,2106-3,Male,M,19.0,1570643
4,,Breast,26000.0,Alabama,1.0,1999.0,1999.0,Other Races and Unknown combined,2131-1,Female,F,27.0,Not Applicable
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,standard groupings of primary cancer sites.,,,,,,,,,,,,
3949,"8. For the 2005 year, the Census Bureau estima...",,,,,,,,,,,,
3950,Texas due to Hurricanes Katrina and Rita. CDC ...,,,,,,,,,,,,
3951,"these states, nor are these counts included in...",,,,,,,,,,,,


In [3]:
# Drop all columns without any information
df_dropped = df.drop(columns=['Notes','Leading Cancer Sites','Year Code','Sex' ],axis=1)
df_dropped = df_dropped.dropna(how='any')
display(df_dropped)

Unnamed: 0,Leading Cancer Sites Code,States,States Code,Year,Race,Race Code,Sex Code,Count,Population
0,26000.0,Alabama,1.0,1999.0,Asian or Pacific Islander,A-PI,F,18.0,17708
1,26000.0,Alabama,1.0,1999.0,Black or African American,2054-5,F,597.0,623475
2,26000.0,Alabama,1.0,1999.0,White,2106-3,F,2360.0,1640665
3,26000.0,Alabama,1.0,1999.0,White,2106-3,M,19.0,1570643
4,26000.0,Alabama,1.0,1999.0,Other Races and Unknown combined,2131-1,F,27.0,Not Applicable
...,...,...,...,...,...,...,...,...,...
3890,26000.0,Wyoming,56.0,2014.0,White,2106-3,F,400.0,268767
3891,26000.0,Wyoming,56.0,2015.0,White,2106-3,F,327.0,270054
3892,26000.0,Wyoming,56.0,2016.0,White,2106-3,F,397.0,269349
3893,26000.0,Wyoming,56.0,2017.0,White,2106-3,F,356.0,266963


In [4]:
df_no_other_race = df_dropped[df_dropped["Race"]!="Other Races and Unknown combined"]
df_no_other_race

Unnamed: 0,Leading Cancer Sites Code,States,States Code,Year,Race,Race Code,Sex Code,Count,Population
0,26000.0,Alabama,1.0,1999.0,Asian or Pacific Islander,A-PI,F,18.0,17708
1,26000.0,Alabama,1.0,1999.0,Black or African American,2054-5,F,597.0,623475
2,26000.0,Alabama,1.0,1999.0,White,2106-3,F,2360.0,1640665
3,26000.0,Alabama,1.0,1999.0,White,2106-3,M,19.0,1570643
5,26000.0,Alabama,1.0,2000.0,Black or African American,2054-5,F,548.0,627439
...,...,...,...,...,...,...,...,...,...
3890,26000.0,Wyoming,56.0,2014.0,White,2106-3,F,400.0,268767
3891,26000.0,Wyoming,56.0,2015.0,White,2106-3,F,327.0,270054
3892,26000.0,Wyoming,56.0,2016.0,White,2106-3,F,397.0,269349
3893,26000.0,Wyoming,56.0,2017.0,White,2106-3,F,356.0,266963


In [5]:
df_clean = df_no_other_race[df_no_other_race["Sex Code"]!="M"]
df_clean

Unnamed: 0,Leading Cancer Sites Code,States,States Code,Year,Race,Race Code,Sex Code,Count,Population
0,26000.0,Alabama,1.0,1999.0,Asian or Pacific Islander,A-PI,F,18.0,17708
1,26000.0,Alabama,1.0,1999.0,Black or African American,2054-5,F,597.0,623475
2,26000.0,Alabama,1.0,1999.0,White,2106-3,F,2360.0,1640665
5,26000.0,Alabama,1.0,2000.0,Black or African American,2054-5,F,548.0,627439
6,26000.0,Alabama,1.0,2000.0,White,2106-3,F,2366.0,1643761
...,...,...,...,...,...,...,...,...,...
3890,26000.0,Wyoming,56.0,2014.0,White,2106-3,F,400.0,268767
3891,26000.0,Wyoming,56.0,2015.0,White,2106-3,F,327.0,270054
3892,26000.0,Wyoming,56.0,2016.0,White,2106-3,F,397.0,269349
3893,26000.0,Wyoming,56.0,2017.0,White,2106-3,F,356.0,266963


In [6]:
df_Black = df_clean[df_clean["Race"]=="Black or African American"]
df_Black

Unnamed: 0,Leading Cancer Sites Code,States,States Code,Year,Race,Race Code,Sex Code,Count,Population
1,26000.0,Alabama,1.0,1999.0,Black or African American,2054-5,F,597.0,623475
5,26000.0,Alabama,1.0,2000.0,Black or African American,2054-5,F,548.0,627439
10,26000.0,Alabama,1.0,2001.0,Black or African American,2054-5,F,617.0,630901
12,26000.0,Alabama,1.0,2002.0,Black or African American,2054-5,F,603.0,633374
16,26000.0,Alabama,1.0,2003.0,Black or African American,2054-5,F,643.0,636876
...,...,...,...,...,...,...,...,...,...
3848,26000.0,Wisconsin,55.0,2014.0,Black or African American,2054-5,F,218.0,209511
3854,26000.0,Wisconsin,55.0,2015.0,Black or African American,2054-5,F,238.0,211201
3860,26000.0,Wisconsin,55.0,2016.0,Black or African American,2054-5,F,237.0,212928
3866,26000.0,Wisconsin,55.0,2017.0,Black or African American,2054-5,F,216.0,214873


In [7]:
df_Black_clean = df_Black.drop(columns=['Leading Cancer Sites','States','States Code','Year', 'Race Code','Sex Code' ],axis=1)
df_Black_clean

KeyError: "['Leading Cancer Sites'] not found in axis"

In [None]:

from pathlib import Path
from sklearn.linear_model import LinearRegression

plt.scatter(df_Black_clean.Population, df_Black_clean.Count)
#plt.xticks([0, 50000, 150000, 250000, 350000, 550000, 750000, 950000])
plt.xlabel('Population')
plt.ylabel('Count')
plt.show()

In [None]:
df_Black_clean.info()

In [None]:
df_Black_clean['Population'] = df_Black_clean['Population'].astype(float)

In [None]:
df_Black_clean.dtypes

In [None]:
plt.scatter(df_Black_clean.Population, df_Black_clean.Count)
#plt.xticks([0, 50000, 150000, 250000, 350000, 550000, 750000, 950000])
plt.xlabel('Population')
plt.ylabel('Count')
plt.show()

In [None]:
X = df_Black_clean.Population.values.reshape(-1, 1)

In [None]:
X.shape

In [None]:
y = df_Black_clean.Count

In [None]:
model = LinearRegression()

In [None]:
model.fit(X, y)

In [None]:
y_pred = model.predict(X)
print(y_pred.shape)

In [None]:
plt.scatter(X, y)
plt.plot(X, y_pred, color='red')
plt.show()

In [None]:
print(model.coef_)
print(model.intercept_)