In [3]:
# The lists in this cell are parallel: hurricane_dictionary reads the same
# index from each of them, so position i always describes one hurricane.

# names of hurricanes
names = ['Cuba I', 'San Felipe II Okeechobee', 'Bahamas', 'Cuba II', 'CubaBrownsville', 'Tampico', 'Labor Day', 'New England', 'Carol', 'Janet', 'Carla', 'Hattie', 'Beulah', 'Camille', 'Edith', 'Anita', 'David', 'Allen', 'Gilbert', 'Hugo', 'Andrew', 'Mitch', 'Isabel', 'Ivan', 'Emily', 'Katrina', 'Rita', 'Wilma', 'Dean', 'Felix', 'Matthew', 'Irma', 'Maria', 'Michael']

# months of hurricanes
months = ['October', 'September', 'September', 'November', 'August', 'September', 'September', 'September', 'September', 'September', 'September', 'October', 'September', 'August', 'September', 'September', 'August', 'August', 'September', 'September', 'August', 'October', 'September', 'September', 'July', 'August', 'September', 'October', 'August', 'September', 'October', 'September', 'September', 'October']

# years of hurricanes (several years occur more than once)
years = [1924, 1928, 1932, 1932, 1933, 1933, 1935, 1938, 1953, 1955, 1961, 1961, 1967, 1969, 1971, 1977, 1979, 1980, 1988, 1989, 1992, 1998, 2003, 2004, 2005, 2005, 2005, 2005, 2007, 2007, 2016, 2017, 2017, 2018]

# maximum sustained winds (mph) of hurricanes
max_sustained_winds = [165, 160, 160, 175, 160, 160, 185, 160, 160, 175, 175, 160, 160, 175, 160, 175, 175, 190, 185, 160, 175, 180, 165, 165, 160, 175, 180, 185, 175, 175, 165, 180, 175, 160]

# areas affected by each hurricane (one inner list per hurricane)
# NOTE(review): some labels are spelled inconsistently and are therefore
# tallied as different areas by affected_areas(): 'Bahamas' vs 'The Bahamas',
# 'United States East coast' vs 'United States East Coast'. 'Yucatn Peninsula'
# looks like a mis-encoded 'Yucatán Peninsula' — confirm against the data source.
areas_affected = [['Central America', 'Mexico', 'Cuba', 'Florida', 'The Bahamas'], ['Lesser Antilles', 'The Bahamas', 'United States East Coast', 'Atlantic Canada'], ['The Bahamas', 'Northeastern United States'], ['Lesser Antilles', 'Jamaica', 'Cayman Islands', 'Cuba', 'The Bahamas', 'Bermuda'], ['The Bahamas', 'Cuba', 'Florida', 'Texas', 'Tamaulipas'], ['Jamaica', 'Yucatn Peninsula'], ['The Bahamas', 'Florida', 'Georgia', 'The Carolinas', 'Virginia'], ['Southeastern United States', 'Northeastern United States', 'Southwestern Quebec'], ['Bermuda', 'New England', 'Atlantic Canada'], ['Lesser Antilles', 'Central America'], ['Texas', 'Louisiana', 'Midwestern United States'], ['Central America'], ['The Caribbean', 'Mexico', 'Texas'], ['Cuba', 'United States Gulf Coast'], ['The Caribbean', 'Central America', 'Mexico', 'United States Gulf Coast'], ['Mexico'], ['The Caribbean', 'United States East coast'], ['The Caribbean', 'Yucatn Peninsula', 'Mexico', 'South Texas'], ['Jamaica', 'Venezuela', 'Central America', 'Hispaniola', 'Mexico'], ['The Caribbean', 'United States East Coast'], ['The Bahamas', 'Florida', 'United States Gulf Coast'], ['Central America', 'Yucatn Peninsula', 'South Florida'], ['Greater Antilles', 'Bahamas', 'Eastern United States', 'Ontario'], ['The Caribbean', 'Venezuela', 'United States Gulf Coast'], ['Windward Islands', 'Jamaica', 'Mexico', 'Texas'], ['Bahamas', 'United States Gulf Coast'], ['Cuba', 'United States Gulf Coast'], ['Greater Antilles', 'Central America', 'Florida'], ['The Caribbean', 'Central America'], ['Nicaragua', 'Honduras'], ['Antilles', 'Venezuela', 'Colombia', 'United States East Coast', 'Atlantic Canada'], ['Cape Verde', 'The Caribbean', 'British Virgin Islands', 'U.S. Virgin Islands', 'Cuba', 'Florida'], ['Lesser Antilles', 'Virgin Islands', 'Puerto Rico', 'Dominican Republic', 'Turks and Caicos Islands'], ['Central America', 'United States Gulf Coast (especially Florida Panhandle)']]

# damages (USD($)) of hurricanes
# Each entry is either a number followed by a magnitude suffix ('M' or 'B')
# or the placeholder 'Damages not recorded'.
damages = ['Damages not recorded', '100M', 'Damages not recorded', '40M', '27.9M', '5M', 'Damages not recorded', '306M', '2M', '65.8M', '326M', '60.3M', '208M', '1.42B', '25.4M', 'Damages not recorded', '1.54B', '1.24B', '7.1B', '10B', '26.5B', '6.2B', '5.37B', '23.3B', '1.01B', '125B', '12B', '29.4B', '1.76B', '720M', '15.1B', '64.8B', '91.6B', '25.1B']

# deaths for each hurricane (integer counts)
deaths = [90,4000,16,3103,179,184,408,682,5,1023,43,319,688,259,37,11,2068,269,318,107,65,19325,51,124,17,1836,125,87,45,133,603,138,3057,74]

In [4]:
# 1
# Update Recorded Damages
# Multiplier for each magnitude suffix found in the raw damage strings:
# "M" scales by one million, "B" by one billion.
conversion = dict(M=10**6, B=10**9)

In [10]:
from collections import Counter, defaultdict

def update_damages(damages, multipliers=None):
    """Convert abbreviated damage strings such as ``'1.42B'`` into floats.

    Args:
        damages: iterable of raw damage entries. A string whose last character
            is a key of ``multipliers`` is converted to
            ``float(prefix) * multipliers[suffix]``; every other entry
            (placeholder text, empty strings, values that are already
            numeric) is passed through unchanged.
        multipliers: optional mapping from suffix character to multiplier.
            Defaults to the notebook-level ``conversion`` table.

    Returns:
        A new list with one entry per input entry, in the same order.

    Raises:
        ValueError: if an entry ends with a known suffix but the text before
            it is not a valid number.
    """
    if multipliers is None:
        # Fall back to the suffix table defined in the notebook.
        multipliers = conversion

    converted = []
    for entry in damages:
        # Only non-empty strings can carry a suffix. Guarding here keeps the
        # function safe to re-run on an already converted list and on ''.
        suffix = entry[-1] if isinstance(entry, str) and entry else None
        if suffix is not None and suffix in multipliers:
            converted.append(float(entry[:-1]) * multipliers[suffix])
        else:
            converted.append(entry)
    return converted

# Damage figures as floats; entries without an 'M'/'B' suffix (the
# 'Damages not recorded' placeholders) are kept as they are.
updated_damages = update_damages(damages)

def hurricane_dictionary(names, months, years, max_sustained_winds, areas_affected, damages, deaths):
    """Combine seven parallel lists into a lookup of hurricane records.

    Position ``i`` of every list must describe the same hurricane. The result
    maps each name to a dict with the keys "Name", "Month", "Year",
    "Max Sustained Wind", "Areas Affected", "Damage" and "Deaths". If a name
    occurs twice, the later record replaces the earlier one.
    """
    records = {}
    for position, name in enumerate(names):
        record = {
            "Name": name,
            "Month": months[position],
            "Year": years[position],
            "Max Sustained Wind": max_sustained_winds[position],
            "Areas Affected": areas_affected[position],
            "Damage": damages[position],
            "Deaths": deaths[position],
        }
        records[name] = record
    return records

# Name-keyed lookup of every hurricane record.
old = hurricane_dictionary(names, months, years, max_sustained_winds, areas_affected, updated_damages, deaths)

# The same records regrouped by year. Several years hold more than one
# hurricane (e.g. 1932, 2005, 2017), so each year maps to a LIST of records;
# a plain {year: record} mapping would silently keep only the last one.
new = {}
for record in old.values():
    new.setdefault(record['Year'], []).append(record)

def affected_areas(old_dict):
    """Count how many hurricane records list each area.

    Walks every record in ``old_dict`` and tallies the entries of its
    'Areas Affected' collection. Returns a ``Counter`` keyed by area label.
    """
    return Counter(
        area
        for record in old_dict.values()
        for area in record['Areas Affected']
    )

# Counter mapping each area label to the number of hurricanes that list it.
affected = affected_areas(old)

def hurricane_with_highest_mortality(old):
    """Return the deadliest hurricane in ``old`` and its death toll.

    Args:
        old: mapping from hurricane name to a record containing a 'Deaths' value.

    Returns:
        A ``(name, deaths)`` tuple. When several hurricanes share the maximum,
        the first one in the mapping's iteration order is returned.

    Raises:
        ValueError: if ``old`` is empty.
    """
    if not old:
        raise ValueError("cannot pick the deadliest hurricane from an empty mapping")
    # The maximum is taken from the records that were passed in, not from a
    # module-level list, so the function also works on any subset of the data.
    deadliest = max(old, key=lambda name: old[name]['Deaths'])
    return deadliest, old[deadliest]['Deaths']

# Name of the deadliest hurricane and the number of deaths it caused.
target_key, target_value = hurricane_with_highest_mortality(old)

def hurricanes_by_mortality(old):
    """Group hurricane records by a mortality rating from 0 to 5.

    Rating 0 means exactly zero deaths; ratings 1-4 correspond to 1-100,
    101-500, 501-1000 and 1001-10000 deaths; any value outside all of those
    ranges is rated 5. Returns a ``defaultdict(list)`` whose values are lists
    of single-entry ``{name: record}`` dicts in input order.
    """
    # Bucket n (1-based) holds the death counts that earn rating n.
    buckets = (range(1, 101), range(101, 501), range(501, 1001), range(1001, 10001))
    grouped = defaultdict(list)
    for name, record in old.items():
        toll = record["Deaths"]
        if toll == 0:
            rating = 0
        else:
            rating = next(
                (level for level, bucket in enumerate(buckets, start=1) if toll in bucket),
                5,
            )
        grouped[rating].append({name: record})
    return grouped

def greatest_damage_and_cost(old):
    """Return the costliest hurricane in ``old`` and its damage figure.

    Only records whose 'Damage' value is numeric take part in the comparison;
    placeholder strings such as 'Damages not recorded' are ignored.

    Args:
        old: mapping from hurricane name to a record containing a 'Damage' value.

    Returns:
        A ``(name, damage)`` tuple. When several hurricanes share the maximum,
        the first one in the mapping's iteration order is returned.

    Raises:
        ValueError: if no record carries a numeric damage figure.
    """
    # Read the figures from the records that were passed in rather than from a
    # module-level list, so the result always matches the mapping given.
    recorded = {
        name: record["Damage"]
        for name, record in old.items()
        if isinstance(record["Damage"], (int, float))
        and not isinstance(record["Damage"], bool)
    }
    if not recorded:
        raise ValueError("no hurricane has a recorded damage figure")
    costliest = max(recorded, key=recorded.get)
    return costliest, recorded[costliest]

# Name of the costliest hurricane and its damage figure.
hurricane, greatest_damage = greatest_damage_and_cost(old)

def hurricanes_by_damage_rating(old):
    """Group hurricane records by a damage rating from 0 to 4.

    The 'Damage' value is truncated with ``int()`` and rated 0 below 10**8,
    1 below 10**9, 2 below 10**10, 3 below 5*10**10 and 4 otherwise (which
    also covers negative amounts). Records whose damage cannot be converted
    because ``int()`` raises ``ValueError`` (e.g. placeholder text) are left
    out. Returns a ``defaultdict(list)`` of ``{name: record}`` dicts.
    """
    # Tier n (0-based) holds the truncated amounts that earn rating n.
    tiers = (
        range(0, 10**8),
        range(10**8, 10**9),
        range(10**9, 10**10),
        range(10**10, 5*10**10),
    )
    grouped = defaultdict(list)
    for name, record in old.items():
        try:
            amount = int(record["Damage"])
        except ValueError:
            # Damage was not recorded as a number: skip this hurricane.
            continue
        rating = next((level for level, tier in enumerate(tiers) if amount in tier), 4)
        grouped[rating].append({name: record})
    return grouped

# Report every result derived in this cell, in the order it was computed.
print(updated_damages)
print(old)
print(new)
print(affected)
# most_common(1) yields [(area, count)]; [0][0] extracts the area label.
print(f"The most affected area due to hurricanes is {affected.most_common(1)[0][0]}.")
print(f"The greatest number of deaths is caused by Hurricane {target_key}, and it caused {target_value} deaths.")
print(hurricanes_by_mortality(old))
print(f"Hurricane {hurricane} caused the greatest damage with a cost of {greatest_damage}$")
print(hurricanes_by_damage_rating(old))

['Damages not recorded', 100000000.0, 'Damages not recorded', 40000000.0, 27900000.0, 5000000.0, 'Damages not recorded', 306000000.0, 2000000.0, 65800000.0, 326000000.0, 60300000.0, 208000000.0, 1420000000.0, 25400000.0, 'Damages not recorded', 1540000000.0, 1240000000.0, 7100000000.0, 10000000000.0, 26500000000.0, 6200000000.0, 5370000000.0, 23300000000.0, 1010000000.0, 125000000000.0, 12000000000.0, 29400000000.0, 1760000000.0, 720000000.0, 15100000000.0, 64800000000.0, 91600000000.0, 25100000000.0]
{'Cuba I': {'Name': 'Cuba I', 'Month': 'October', 'Year': 1924, 'Max Sustained Wind': 165, 'Areas Affected': ['Central America', 'Mexico', 'Cuba', 'Florida', 'The Bahamas'], 'Damage': 'Damages not recorded', 'Deaths': 90}, 'San Felipe II Okeechobee': {'Name': 'San Felipe II Okeechobee', 'Month': 'September', 'Year': 1928, 'Max Sustained Wind': 160, 'Areas Affected': ['Lesser Antilles', 'The Bahamas', 'United States East Coast', 'Atlantic Canada'], 'Damage': 100000000.0, 'Deaths': 4000}, '

defaultdict(list,
            {1: [{'San Felipe II Okeechobee': {'Name': 'San Felipe II Okeechobee',
                'Month': 'September',
                'Year': 1928,
                'Max Sustained Wind': 160,
                'Areas Affected': ['Lesser Antilles',
                 'The Bahamas',
                 'United States East Coast',
                 'Atlantic Canada'],
                'Damage': 100000000.0,
                'Deaths': 4000}},
              {'New England': {'Name': 'New England',
                'Month': 'September',
                'Year': 1938,
                'Max Sustained Wind': 160,
                'Areas Affected': ['Southeastern United States',
                 'Northeastern United States',
                 'Southwestern Quebec'],
                'Damage': 306000000.0,
                'Deaths': 682}},
              {'Carla': {'Name': 'Carla',
                'Month': 'September',
                'Year': 1961,
                'Max Sustained Wind': 175,
    

In [1]:
import warnings
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from my_python.filefinder import helper

In [2]:
# Silence every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides convergence and deprecation
# warnings from the models fitted below — consider filtering by category.
warnings.filterwarnings('ignore')

In [None]:
# `helper` comes from the project-local module my_python.filefinder; its
# implementation is not visible in this notebook.
# presumably returns the feature DataFrame (df.head / df.columns are used below) — TODO confirm
df = helper.stat_display()
# Dataset object; later cells read its .data, .target, .target_names and .feature_names.
data = helper.sklearn_data

In [None]:
# Summarize the outcome variable: number of distinct labels, their values and names.
# The first fragment ends with a space because adjacent f-strings are
# concatenated verbatim; without it the two sentences run together.
print(f'Number of the outcome variable: {len(np.unique(data.target))}. '
      f'Possible values: {np.unique(data.target)} Labels: {(data.target_names)}')

In [None]:
# Number of rows in the data matrix and number of feature names.
print(f'Observations: {data.data.shape[0]}, features: {data.feature_names.shape[0]}')

In [None]:
# Preview the first three rows of the DataFrame returned by helper.stat_display().
df.head(3)

In [None]:
# Take the first ten column names of df as candidate features (to be checked
# for multicollinearity, i.e. high pairwise correlation) and display them.
# NOTE(review): `columns` is not referenced again in the visible cells; the
# features actually used are the ones returned by helper.custom_corr(df).
columns = df.columns.to_list()[:10]
columns

In [8]:
# Column labels returned by the helper; used below to select df[features].
# presumably custom_corr filters out highly correlated columns — TODO confirm against helper
features = helper.custom_corr(df)

In [None]:
# Helper-drawn plot for df and the selected features
# (presumably a correlation heatmap — TODO confirm against helper).
helper.custom_heat(df, features)

In [None]:
# X: predictors, i.e. df restricted to the selected feature columns.
# y: outcome labels taken from the dataset object.
X = df[features]
y = data.target
# Show both container types as a quick sanity check.
type(X), type(y)

In [None]:
# Class counts: a = samples labelled 0 (malignant), b = samples labelled 1 (benign).
a, b = np.sum(y == 0), np.sum(y == 1)
print(f'Malignant tumors number: {a}, benign tumors number: {b}')
# The reported rate is the share of benign samples in the data (class
# balance); it is not a classifier's true-positive rate.
print(f'The sample is imbalanced, the benign tumors (true positivity) rate is {b/(b+a):.2}')

In [None]:
# Rule of thumb used here: at most (smallest class size / 10) predictors.
# X must still be the DataFrame at this point (X.columns is read below).
min_class_size = min(a, b)
# At a maximum, there should be no more than the smallest class size divided by 10 number of features.
max_features = min_class_size / 10
# The first fragment ends with a space because adjacent f-strings are
# concatenated verbatim; without it the two sentences run together.
print(f'Maximum features could be {round(max_features)}, the current number of predictor variables is {len(X.columns)}. '
      f'The statement that the sample is big enough is {max_features > X.columns.nunique()}')

In [None]:
# Exploratory pair plot of the predictors, drawn by the helper.
helper.custom_pair(X)

In [None]:
# Preview the first three rows of the predictors.
X.head(3)

In [15]:
#np.mean(X['mean radius'])

In [None]:
# Fit a StandardScaler on the predictor DataFrame; the transform itself is
# applied in the next cell.
# NOTE(review): the scaler is fitted on all rows before train_test_split is
# called, so statistics of the future test rows leak into the scaling —
# consider fitting on the training split only.
scaler = StandardScaler()
scaler.fit(X)

In [None]:
# Rebind X to the standardized values and display them.
# NOTE(review): X is overwritten in place of the DataFrame, so this cell is
# not idempotent and earlier cells that use X.columns / X.head() will fail if
# re-run after it. Later cells index X as a 2-D array (X[:, 0]).
X = scaler.transform(X)
X

In [None]:
# Sanity check: mean of the first standardized column (expected to be close to 0).
np.mean(X[:,0])

In [None]:
# Show the container types and shapes of X and y before splitting.
type(X), type(y), X.shape, y.shape

In [20]:
# Hold out 30% of the rows as the test set; random_state fixes the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=50)

In [None]:
# Label counts per split: c/d = label 1 / label 0 in train, e/f = label 1 / label 0 in test.
# The displayed rates are the share of label-1 samples in each split (a class
# balance check), not a classifier's true-positive rate.
c, d, e, f = np.sum(y_train == 1), np.sum(y_train == 0), np.sum(y_test == 1), np.sum(y_test == 0)
f'The train true positivity rate is {c/(c+d):.2}, the test true positivity rate is {e/(e+f):.2}'

In [22]:
# Fit a logistic regression on the training split: no penalty, with an
# intercept, and class_weight='balanced' to compensate for the class imbalance.
# Predictions on the test split are made in the next cell.
lrm = LogisticRegression(penalty=None, fit_intercept=True, class_weight='balanced')
model = lrm.fit(X_train, y_train)

In [None]:
# Custom decision threshold: predict 1 only when the probability taken from
# column 1 of predict_proba is strictly greater than `threshold`, else 0.
threshold = 0.9
# Column 1 of predict_proba (presumably the probability of label 1 — depends on model.classes_ order).
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = np.where(y_pred_proba > threshold, 1, 0)
# Display true and predicted labels side by side.
y_test, y_pred

In [None]:
# Count the four confusion-matrix cells by hand (label 1 is the positive
# class), then print scikit-learn's matrix for comparison.
tp = tn = fp = fn = 0

for position in range(len(y_test)):
  actual, predicted = y_test[position], y_pred[position]
  if actual == 1 and predicted == 1:
    tp += 1
  elif actual == 0 and predicted == 0:
    tn += 1
  elif actual == 0 and predicted == 1:
    fp += 1
  else:
    # Every remaining combination is counted as a false negative.
    fn += 1
print(tp, tn, fp, fn)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

In [None]:
# Helper call with the fitted model and the test split
# (presumably renders confusion matrices — TODO confirm against helper).
helper.custom_confusion(model, X_test=X_test, y_test=y_test)

### Conclusion: 
``We can see how using a lower prediction threshold changes the results. If our goal is to minimize the false negatives and the false positives at the same time, so that their ratio is close to 1, lowering the threshold is a good idea. With this approach we can avoid treating people as if they had a malignant tumor when it is actually benign (false negatives, bottom-left corner), or at least reduce how often that happens, while controlling the false positives (minimizing the number of patients who have a malignant tumor but were predicted benign) increases the model's precision, which is important in this case. With this in mind, the chosen threshold should be somewhere between 0.2 and 0.3, where the precision and the recall are closest to each other.``

``This is a malignant-vs-benign breast cancer prediction model built on the selected predictor features with the LogisticRegression algorithm; it shows that the prediction threshold has to be chosen according to the goal we want to achieve. If we want to save as many lives as possible, we should maximize the precision score (that is, minimize the false positives) rather than minimize the number of false negatives. In that case, we should choose 0.6 as the threshold. If we want to maximize both the recall and the precision scores at the same time, we should choose 0.3 as the prediction threshold.``

In [None]:
# Helper call with the fitted model and both splits
# (presumably plots ROC curves for train and test — TODO confirm against helper).
helper.custom_roc(model, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [None]:
# Fit a k-nearest-neighbours classifier for every k from 1 to 100 and record
# its accuracy on the test split.
scores_and_k = []
accuracies = []

for k in range(1, 101):
  classifier = KNeighborsClassifier(n_neighbors = k)
  classifier.fit(X_train, y_train)
  # A dedicated name is used for the score: reusing `a` here would overwrite
  # the malignant-class count `a` that the feature-budget cell reads.
  accuracy = classifier.score(X_test, y_test)
  scores_and_k.append([accuracy, k])
  accuracies.append(accuracy)
# max() compares the [accuracy, k] pairs element by element, so when several
# k share the best accuracy the largest of those k is reported.
print(max(scores_and_k))

In [None]:
# Pass the k values (1..100, the same range as the loop above), the recorded
# accuracies and the [accuracy, k] pairs to the helper
# (presumably plots accuracy against k — TODO confirm against helper).
helper.k_nearest_neigh(list(range(1, 101)), list_of_accuracy=accuracies, scores_and_k=scores_and_k)

In [None]:
# Shallow decision tree: at most 3 levels, Gini impurity as split criterion,
# and cost-complexity pruning with ccp_alpha=0.01.
# random_state is fixed (same seed as the train/test split) because
# scikit-learn permutes the candidate features at each split, so an unseeded
# tree is not guaranteed to be identical between runs.
dt = DecisionTreeClassifier(max_depth=3, ccp_alpha=0.01, criterion='gini', random_state=50)
dt.fit(X_train, y_train)

In [30]:
import matplotlib.pyplot as plt
from sklearn import tree

In [None]:
# Draw the fitted decision tree on a large canvas.
# plot_tree's max_depth=5 is above the tree's own max_depth of 3, so no level
# is cut off. class_names are given in label order (0 = malignant, 1 = benign).
plt.figure(figsize=(20, 12))
tree.plot_tree(dt, feature_names= features, max_depth=5, class_names = ['malignant', 'benign'], label='all', filled=True, rounded=True)
plt.tight_layout()
plt.show()