In [None]:
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, train_test_split
from sklearn.linear_model import Lasso, Ridge, LinearRegression, LogisticRegression
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.metrics import mean_squared_error, accuracy_score, recall_score, precision_score, f1_score
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE

In [None]:
# Load the raw dataset from the repo-relative data directory into `df`.
df = pd.read_csv('../data/bigml_59c28831336c6604c800002a.csv')

In [None]:
 class ModelWithCV(): #class acquired through Flatiron School
    '''Bundle an estimator with its data and its cross-validation scores.

    Attributes:
      model: the estimator handed to ``cross_val_score``.
      name: label used in printed summaries and plot titles.
      X, y: default data used when ``cross_validate`` gets no arguments.
      cv_results, cv_mean, cv_median, cv_std: per-fold scores and their
        summary statistics; all ``None`` until ``cross_validate`` has run.
    '''

    def __init__(self, model, model_name, X, y, cv_now=True):
        '''
        Store the estimator and data; optionally cross-validate immediately.

        Args:
          model: estimator passed to ``cross_val_score``.
          model_name: display name for summaries/plots.
          X, y: data used by default for cross-validation.
          cv_now: when True (default) run ``cross_validate()`` right away.
        '''
        self.model = model
        self.name = model_name
        self.X = X
        self.y = y
        # For CV results
        self.cv_results = None
        self.cv_mean = None
        self.cv_median = None
        self.cv_std = None
        #
        if cv_now:
            self.cross_validate()

    def cross_validate(self, X=None, y=None, kfolds=10):
        '''
        Perform cross-validation and store the results on the instance
        (``cv_results``, ``cv_mean``, ``cv_median``, ``cv_std``).
        Nothing is returned.

        Args:
          X:
            Optional; Training data to perform CV on. Otherwise use X from object
          y:
            Optional; Training data to perform CV on. Otherwise use y from object
          kfolds:
            Optional; Number of folds for CV (default is 10)
        '''
        # Compare against None explicitly: truth-testing a DataFrame or
        # ndarray (`X if X else ...`) raises "truth value is ambiguous".
        cv_X = self.X if X is None else X
        cv_y = self.y if y is None else y

        self.cv_results = cross_val_score(self.model, cv_X, cv_y, cv=kfolds)
        self.cv_mean = np.mean(self.cv_results)
        self.cv_median = np.median(self.cv_results)
        self.cv_std = np.std(self.cv_results)


    def print_cv_summary(self):
        '''
        Print mean ± standard deviation of the stored CV scores.

        Requires ``cross_validate`` to have run; the ``:.5f`` format fails on
        the initial ``None`` values otherwise.
        '''
        cv_summary = (
        f'''CV Results for `{self.name}` model:
            {self.cv_mean:.5f} ± {self.cv_std:.5f} accuracy
        ''')
        print(cv_summary)


    def plot_cv(self, ax):
        '''
        Plot the cross-validation values using the array of results and given
        Axis for plotting.

        Args:
          ax: axes object forwarded to seaborn via ``ax=``.

        Returns:
          The same ``ax`` that was passed in.
        '''
        ax.set_title(f'CV Results for `{self.name}` Model')
        # Thinner violinplot with higher bw
        sns.violinplot(y=self.cv_results, ax=ax, bw=.4)
        # Overlay the individual fold scores on top of the violin.
        sns.swarmplot(
                y=self.cv_results,
                color='orange',
                size=10,
                alpha= 0.8,
                ax=ax
        )

        return ax

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Summary statistics for the frame's columns.
df.describe()

In [None]:
# Column dtypes and non-null counts before any cleaning.
df.info()

In [None]:
# Number of distinct (non-null) values in `state`; originally flagged "???" for review.
df['state'].nunique()

In [None]:
df['state'].value_counts().sort_index()  # 51 entries: presumably the 50 states plus DC (District of Columbia) -- TODO confirm

In [None]:
# Class balance of the target: absolute counts first, then proportions.
tuple(df['churn'].value_counts(normalize=as_share) for as_share in (False, True))

In [None]:
# Proportion of each churn class (repeats the normalized half of the previous cell's output).
df.churn.value_counts(normalize=True)

In [None]:
# Distribution of `area code`: absolute counts first, then proportions.
tuple(df['area code'].value_counts(normalize=as_share) for as_share in (False, True))

In [None]:
# Show a single row to eyeball the value format of every column.
df.head(1)

In [None]:
# List the column labels.
df.columns

In [None]:
# Plain Python list of the column labels, iterated over by the next cell.
columns = df.columns.tolist()

In [None]:
# One dict per column: key is (column name, number of distinct values),
# value is a one-element list holding that column's value counts.
[
    {(col, len(counts)): [counts]}
    for col, counts in ((name, df[name].value_counts()) for name in columns)
]

In [None]:
# Recode the yes/no flag; the results are the *strings* '1' and '0',
# so the column keeps its string dtype.
df['international plan'] = (
    df['international plan']
    .str.replace('yes', '1')  # replacing yes with 1
    .str.replace('no', '0')   # replacing no with 0
)
df['international plan'].value_counts()

In [None]:
# Recode the yes/no flag; the results are the *strings* '1' and '0',
# so the column keeps its string dtype.
df['voice mail plan'] = (
    df['voice mail plan']
    .str.replace('yes', '1')  # replacing yes with 1
    .str.replace('no', '0')   # replacing no with 0
)
df['voice mail plan'].value_counts()

In [None]:
# Encode the target as integers (True -> 1, False -> 0).
# A dtype cast is used instead of masked `.loc` assignment: writing ints into
# a boolean column upcasts it to object dtype, which recent pandas warns about
# and which scikit-learn classifiers reject as an "unknown" label type.
# The cast is also safe to re-run (int -> int is a no-op).
# Assumes `churn` holds only True/False (or already 0/1) with no missing
# values -- TODO confirm against the raw file.
df['churn'] = df['churn'].astype(int)
df['churn'].value_counts()

In [None]:
# Re-inspect the first rows after the plan/churn recoding above.
df.head()

In [None]:
# Column labels, checked before dropping `phone number` in the next cell.
df.columns

In [None]:
# Remove the `phone number` column. Rebinding with errors='ignore' (instead of
# an in-place drop) lets this cell be re-run without a KeyError once the
# column is already gone.
df = df.drop(columns='phone number', errors='ignore')

In [None]:
# Dtypes and non-null counts after dropping `phone number`.
df.info()

In [None]:
# Locate cells equal to the empty string; returns (row positions, column positions).
# Vectorised `eq` replaces `DataFrame.applymap`, which is deprecated in
# pandas >= 2.1 and calls a Python lambda for every cell.
np.where(df.eq('').to_numpy())

In [None]:
# Turn cells that are exactly a single space into NaN so `isna()` can count them.
# NOTE(review): the preceding cell scans for '' (empty string) while this
# replaces ' ' (one space) -- confirm which placeholder the data actually uses.
df = df.replace(' ', np.nan)  

In [None]:
# Missing-value count per column after the blank -> NaN replacement.
df.isna().sum()

In [None]:
# Dtypes and non-null counts once more, after the NaN replacement.
df.info()

In [None]:
# The two columns that get one-hot encoded in the next cell.
df[['state','area code']]

In [None]:
# One-hot encode the two categorical columns; drop='first' omits the first
# level of each feature so the dummy columns are not redundant.
cat_cols = df[['state','area code']]

# Keep the encoder's default (sparse) output and densify explicitly below.
# The `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2
# and later removed, so passing neither works on old and new releases.
ohe = OneHotEncoder(drop='first')

dums = ohe.fit_transform(cat_cols).toarray()

# `get_feature_names` no longer exists in recent scikit-learn releases;
# use its replacement when present and fall back for old releases.
# NOTE(review): the two APIs may label columns differently (input column
# names vs. generic `x0_`/`x1_` prefixes) -- confirm nothing downstream
# depends on the exact dummy column names.
try:
    ohe_columns = ohe.get_feature_names_out()
except AttributeError:
    ohe_columns = ohe.get_feature_names()

dums_df = pd.DataFrame(dums,
                       columns=ohe_columns,
                       index=cat_cols.index)

In [None]:
# Append the dummy columns to the frame, then remove the original
# categorical columns they were derived from.
df_clean = pd.concat([df, dums_df], axis=1).drop(columns=list(cat_cols))
df_clean

In [None]:
# X_train_cat = X_train.select_dtypes('object')

# ohe = OneHotEncoder(
#     drop='first',
#     sparse=False)

# dums = ohe.fit_transform(X_train_cat)
# dums_df = pd.DataFrame(dums,
#                        columns=ohe.get_feature_names(),
#                        index=X_train_cat.index)
# X_train_nums = X_train.select_dtypes('float64')

# ss = StandardScaler()

# ss.fit(X_train_nums)
# nums_df = pd.DataFrame(ss.transform(X_train_nums),
#                       index=X_train_nums.index)
# X_train_clean = pd.concat([nums_df, dums_df], axis=1)

In [None]:
# Separate the target from the features, then hold out 33% of the rows
# for testing with a fixed seed.
y = df_clean['churn']
X = df_clean.drop(columns=['churn'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.33,
    random_state=3,
)

In [None]:
# Class counts of the target, as a reference point for the dummy baseline below.
df.churn.value_counts()

In [None]:
# Baseline: a DummyClassifier with default settings, scored on both splits.
dr = DummyClassifier()
dummy_model = dr.fit(X_train, y_train)

# Score the train split first, then the test split.
dummy_train, dummy_test = (
    dr.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

print(f'Train: {dummy_train}', f'Test : {dummy_test}', sep='\n')

In [None]:
# Oversample the minority class of the training split only (seeded).
# NOTE(review): the model cells that follow fit on X_train / y_train rather
# than X_resmp / y_resmp -- confirm whether the resampled data should be used.
sm = SMOTE(
    random_state=3,
    sampling_strategy='minority',
)
X_resmp, y_resmp = sm.fit_resample(X_train, y_train)

# Class counts after resampling.
y_resmp.value_counts()

In [None]:
# Default logistic regression fitted on the (non-resampled) training split;
# the cell displays (train score, test score).
lr = LogisticRegression()
lr.fit(X_train, y_train)
tuple(
    lr.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

In [None]:
# Shallow random forest. The seed matches the random_state=3 used for the
# split and for SMOTE, so the fitted model and its CV scores are reproducible
# across kernel restarts.
rfc = RandomForestClassifier(max_depth=5, random_state=3)
# Fit directly so `rfc.score` can be used in the next cell.
rfc.fit(X_train,y_train)
# ModelWithCV runs 10-fold cross-validation on construction (cv_now defaults to True).
forest = ModelWithCV(rfc, 'forest_pipe', X_train, y_train)

In [None]:
# Training-set score of the fitted forest alongside its per-fold CV scores.
rfc.score(X_train,y_train) , forest.cv_results