In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score,explained_variance_score
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.linear_model import LinearRegression
from math import sqrt
import warnings
warnings.filterwarnings("ignore")
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

#import our scripts that do data science workflow
import wrangle
import split_scale
# import evaluate
import features

### Our scenario continues:

As a customer analyst, I want to know who has spent the most money with us over their lifetime. I have monthly charges and tenure, so I think I will be able to use those two attributes as features to estimate total_charges. I need to do this within an average of $5.00 (RMSE) per customer. So scale X, but not y.

### 1. run all your previous scripts that acquired, prepared, split, and scaled your data.

In [5]:
# Build the customer frame with the project-local wrangle script.
# NOTE(review): wrangle_telco() is defined outside this notebook; presumably it
# returns the customer_id / total_charges / monthly_charges / tenure columns
# seen in the test.head() output further down — confirm against wrangle.py.
df = wrangle.wrangle_telco()

In [9]:
# Split df into a train frame and a test frame via the project-local split_scale script.
# NOTE(review): the scenario asks for X to be scaled (but not y), yet none of the
# visible cells applies a scaler — confirm whether a scaling step is missing here.
train, test = split_scale.split_my_data(df)

In [8]:
# Preview the first rows of the test split.
test.head()

Unnamed: 0,customer_id,total_charges,monthly_charges,tenure
120,2192-CKRLV,3580.95,49.2,72
1423,7596-IIWYC,538.2,20.25,27
389,6408-WHTEF,6376.55,89.4,72
1137,3070-FNFZQ,478.75,20.4,23
1504,8774-GSBUN,1797.1,25.4,72


In [10]:
# Feature matrix and target for each split. Every later cell refers to
# X_train / y_train / X_test / y_test, so they must be defined here for the
# notebook to survive Restart Kernel -> Run All.
X_train = train[['monthly_charges', 'tenure']]
y_train = train[['total_charges']]
X_test = test[['monthly_charges', 'tenure']]
y_test = test[['total_charges']]

# The original short names are kept as aliases of the training split.
X = X_train
y = y_train

### 2. select your features using your features script

In [None]:
# Ask the project-local features script how many features to keep.
# NOTE(review): optimal_number_of_features is defined in features.py (not shown
# here); presumably it returns the best feature count — confirm.
features.optimal_number_of_features(X_train,y_train)

In [None]:
# Select the top 2 features (last argument) using the train/test feature frames.
# NOTE(review): `features_demo` is never imported in this notebook — only
# `features` is — so this cell raises NameError on a fresh kernel. Confirm which
# module is intended and either import it or call features.<function> instead.
features_demo.optimal_features(X_train,X_test,y_train,2)

### 3. fit 2 different linear models to your data

In [None]:
# Collect the observed target next to each model's in-sample predictions.
predictions = pd.DataFrame({'actual': y_train.total_charges}).reset_index(drop=True)

# model 1: monthly_charges + tenure
lm1 = LinearRegression()
lm1.fit(X_train[['monthly_charges', 'tenure']], y_train)
lm1_predictions = lm1.predict(X_train[['monthly_charges', 'tenure']])
# y_train is a one-column frame, so flatten the predictions before storing
# them as a single column.
predictions['lm1'] = np.asarray(lm1_predictions).ravel()

# model 2: tenure only
lm2 = LinearRegression()
lm2.fit(X_train[['tenure']], y_train)
lm2_predictions = lm2.predict(X_train[['tenure']])
predictions['lm2'] = np.asarray(lm2_predictions).ravel()

# baseline model: always predict the mean of the training target.
# Select the column by name instead of `y_train.mean()[0]`, which depends on
# deprecated positional integer indexing of a label-indexed Series.
predictions['baseline'] = y_train.total_charges.mean()
predictions.head()

### 4. evaluate the 2 models and your baseline.

In [None]:
# Error metrics for the mean-only baseline on the training rows:
# MSE, SSE (MSE times the number of rows), RMSE and R^2.
MSE_baseline = mean_squared_error(y_true=predictions['actual'], y_pred=predictions['baseline'])
SSE_baseline = MSE_baseline * len(predictions)
RMSE_baseline = sqrt(MSE_baseline)
r2_baseline = r2_score(y_true=predictions['actual'], y_pred=predictions['baseline'])
print(MSE_baseline, SSE_baseline, RMSE_baseline, r2_baseline)

In [None]:
# Error metrics for model 1 (monthly_charges + tenure) on the training rows:
# MSE, SSE (MSE times the number of rows), RMSE and R^2.
MSE_1 = mean_squared_error(y_true=predictions['actual'], y_pred=predictions['lm1'])
SSE_1 = MSE_1 * len(predictions)
RMSE_1 = sqrt(MSE_1)
r2_1 = r2_score(y_true=predictions['actual'], y_pred=predictions['lm1'])
print(MSE_1, SSE_1, RMSE_1, r2_1)

In [None]:
# Error metrics for model 2 (tenure only) on the training rows:
# MSE, SSE (MSE times the number of rows), RMSE and R^2.
MSE_2 = mean_squared_error(y_true=predictions['actual'], y_pred=predictions['lm2'])
SSE_2 = MSE_2 * len(predictions)
RMSE_2 = sqrt(MSE_2)
r2_2 = r2_score(y_true=predictions['actual'], y_pred=predictions['lm2'])
print(MSE_2, SSE_2, RMSE_2, r2_2)

### 5. select the one that performed the best.

Model 1 (`lm1`), which uses both monthly charges and tenure, performed the best.

### 6. apply to your test data

In [None]:
# Score the test rows with the selected model (lm1) and line the predictions up
# with the observed totals. ravel() already yields 1-D arrays of whatever length
# the test split has, so no hard-coded row count is needed.
model = np.asarray(lm1.predict(X_test[['monthly_charges', 'tenure']])).ravel()
y_test1 = np.asarray(y_test).ravel()
best_model = pd.DataFrame({'predictions': model, 'total_charges': y_test1})

best_model.head()

### 7. Write a function that creates the model object, fits and predicts, given X_train, X_test, y_train, y_test

In [None]:
# Restrict both splits to the two modelling features.
X_train1 = X_train.loc[:, ['monthly_charges', 'tenure']]
X_test1 = X_test.loc[:, ['monthly_charges', 'tenure']]
def modeling_function(X_train,X_test,y_train,y_test):
    '''
    Fit a LinearRegression on the training split and predict both splits.

    The model is fitted on (X_train, y_train) only; the test split is used
    purely for prediction so that its scores reflect unseen data.

    Parameters
    ----------
    X_train, X_test : feature frames with the same columns.
    y_train, y_test : frames that contain a `total_charges` column.

    Returns
    -------
    (predictions_train, predictions_test) : tuple of DataFrames
        predictions_train has columns 'actual' and 'lm1' (in-sample predictions);
        predictions_test has columns 'actual' and 'lm2' (predictions for the
        test rows from the same train-fitted model). The column names are kept
        from the earlier version of this function for compatibility.
    '''
    lm = LinearRegression()
    lm.fit(X_train, y_train)

    predictions_train = pd.DataFrame({'actual': y_train.total_charges}).reset_index(drop=True)
    # y_train is a frame, so flatten the predictions into a single column
    predictions_train['lm1'] = np.asarray(lm.predict(X_train)).ravel()

    predictions_test = pd.DataFrame({'actual': y_test.total_charges}).reset_index(drop=True)
    predictions_test['lm2'] = np.asarray(lm.predict(X_test)).ravel()

    return predictions_train, predictions_test

In [None]:
# Run the modelling helper on the two-feature frames; it returns one prediction
# frame for the training rows and one for the test rows.
model_train,model_test=modeling_function(X_train1,X_test1,y_train,y_test)

### 8. Write a function, plot_residuals(x, y, dataframe) that takes the feature, the target, and the dataframe as input and returns a residual plot.

In [None]:
def plot_residuals(x, y, dataframe=None):
    '''
    Plots the residuals of a model that uses x to predict y. Note that we don't
    need to make any predictions ourselves here, seaborn will create the model
    and predictions for us under the hood with the `residplot` function.

    Parameters
    ----------
    x, y : the feature and the target, either as data vectors or, when
        `dataframe` is given, as column names in that frame.
    dataframe : optional DataFrame that x and y name columns of.

    Returns
    -------
    Whatever `sns.residplot` returns (the matplotlib Axes with the plot).
    '''
    # x / y are passed by keyword: recent seaborn releases only accept `data`
    # positionally.
    return sns.residplot(x=x, y=y, data=dataframe)

# Residual plot of total_charges against monthly_charges on the test split.
# x and y stay one-column frames; later cells reuse them.
x = test.loc[:, ['monthly_charges']]
y = test.loc[:, ['total_charges']]
plot_residuals(x, y)

### 9. Write a function, plot_regression(x, y) that takes a feature and a target and returns the datapoints, the regression line, and the confidence interval.  (Hint: Take advantage of things that have already been written)

In [None]:
# Fit an ordinary least squares model of y on x with statsmodels.
# NOTE(review): x is passed as-is (no sm.add_constant), and statsmodels OLS does
# not add an intercept on its own, so this is a through-the-origin fit —
# confirm that is intended.
res = sm.OLS(y, x).fit()

In [None]:
# Display the summary table of the fitted OLS results.
res.summary()

In [None]:
# Standard error of prediction plus the lower / upper bounds around the fit.
prstd, iv_l, iv_u = wls_prediction_std(res)

# Line series are drawn in ascending-x order. In raw row order the dashed
# segments jump back and forth along the same path and paint over each other.
x_vals = np.asarray(x).ravel()
order = np.argsort(x_vals)
x_sorted = x_vals[order]

fig, ax = plt.subplots(figsize=(8,6))

ax.plot(x, y, 'o', label="data")
ax.plot(x_sorted, np.asarray(res.fittedvalues)[order], 'r--.', label="OLS")
ax.plot(x_sorted, np.asarray(iv_u)[order], 'g--',label='97.5')
ax.plot(x_sorted, np.asarray(iv_l)[order], 'b--',label='2.5')
ax.legend(loc='best');

In [None]:
def plot_regression(x,y):
    '''
    Plot the data points, the OLS regression line and the bounds returned by
    `wls_prediction_std` for a single feature x and a target y.

    The model is fitted with `sm.OLS(y, x)` exactly as given, i.e. no constant
    column is added to x here.

    Parameters
    ----------
    x : a single feature (1-D vector or one-column frame).
    y : the target, same length as x.

    Returns
    -------
    None. The figure is shown with `plt.show()`.
    '''
    res = sm.OLS(y, x).fit()
    # standard error of prediction (unused) and lower / upper bounds
    _, iv_l, iv_u = wls_prediction_std(res)

    # Line series are drawn in ascending-x order. In raw row order the dashed
    # segments jump back and forth along the same path and paint over each other.
    x_vals = np.asarray(x).ravel()
    order = np.argsort(x_vals)
    x_sorted = x_vals[order]

    fig, ax = plt.subplots(figsize=(8,6))

    ax.plot(x, y, 'o', label="data")
    ax.plot(x_sorted, np.asarray(res.fittedvalues)[order], 'r--.', label="OLS")
    ax.plot(x_sorted, np.asarray(iv_u)[order], 'g--',label='97.5% Confidence Level')
    ax.plot(x_sorted, np.asarray(iv_l)[order], 'b--',label='2.5% Confidence Level')
    ax.legend(loc='best');
    plt.show()
    