# Introduction to Regression Project-Megaline

# Defining the question

# a) Specifying the data analysis question


Mobile carrier Megaline would like to develop a model that analyzes subscribers' behavior and recommends
one of its newer plans: Smart or Ultra.

#b) Defining the Metric for Success

We will have accomplished our objective if we can develop a model that picks the right plan with the highest possible accuracy.

#c) Understanding the Context

Mobile carrier Megaline has found out that many of their subscribers use legacy plans.
They want to develop a model that would analyze subscribers' behavior and recommend
one of Megaline's newer plans: Smart or Ultra.

#d)  Recording the Experimental Design

1. Importing libraries
2. Data Importation
3. Data Modeling
4. Model Evaluation
5. Hyperparameter Tuning
6. Sanity Check
7. Findings and Recommendations




#e) Data Relevance

The data was relevant for our analysis

# Data importation and modelling

In [None]:
#importing libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score

In [None]:
#loading the subscriber behaviour dataset from the remote CSV
sub_df = pd.read_csv(filepath_or_buffer="https://bit.ly/UsersBehaviourTelco")
#preview the first five rows
sub_df.head()

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra
0,40.0,311.9,83.0,19915.42,0
1,85.0,516.75,56.0,22696.96,0
2,77.0,467.66,86.0,21060.45,0
3,106.0,745.53,81.0,8437.39,1
4,66.0,418.74,1.0,14502.75,0


In [None]:
#(number of rows, number of columns) of the loaded dataset
sub_df.shape

(3214, 5)

In [None]:
#how many rows fall into each value of the is_ultra target column
sub_df.is_ultra.value_counts()

0    2229
1     985
Name: is_ultra, dtype: int64

In [None]:
#inspect the dtype of every column before any conversion
sub_df.dtypes

calls       float64
minutes     float64
messages    float64
mb_used     float64
is_ultra      int64
dtype: object

In [None]:
#cast the number of messages and the number of calls to integer dtype
for count_column in ('messages', 'calls'):
    sub_df[count_column] = sub_df[count_column].astype(int)

In [None]:
#re-inspect the column dtypes after the integer conversion of calls and messages
sub_df.dtypes

calls         int64
minutes     float64
messages      int64
mb_used     float64
is_ultra      int64
dtype: object

In [None]:
#carving the data into training, validation and test sets

from sklearn.model_selection import train_test_split

target = sub_df['is_ultra']                    #y
features = sub_df.drop(columns=['is_ultra'])   #X

#first hold out 25% of all rows as the test set ...
features_train, features_test, target_train, target_test = train_test_split(
    features, target, test_size=0.25, random_state=12345
)
#... then hold out 20% of the remaining rows as the validation set
features_train, features_valid, target_train, target_valid = train_test_split(
    features_train, target_train, test_size=0.2, random_state=12345
)

#report the size of the training and validation pieces
for subset in (features_train, features_valid, target_train, target_valid):
    print(subset.shape)


(1928, 4)
(482, 4)
(1928,)
(482,)


# 1.Decision Tree Classifier

In [None]:
#Baseline decision tree (default hyperparameters) fitted on the training set
model_dtr = DecisionTreeClassifier(random_state=12345).fit(features_train, target_train)

#predict the plan for every validation row
prediction = model_dtr.predict(features_valid)

#show ten random validation rows: predicted value next to the actual value
sub_check = pd.DataFrame({'Actual': target_valid, 'Predicted': prediction})
print(sub_check.sample(10))

#share of validation rows predicted correctly
accuracy = accuracy_score(target_valid, prediction)
print('accuracy = ', accuracy)


      Actual  Predicted
514        0          0
1966       0          0
1462       0          0
912        0          0
2506       0          0
1384       1          0
1549       1          0
376        0          0
554        0          0
1990       0          1
accuracy =  0.7468879668049793


The decision tree classifier has an accuracy level of 74.69%

#2.Random Forest Classifier

In [None]:
#Baseline random forest with three trees, fitted on the training set
model_rf = RandomForestClassifier(n_estimators=3, random_state=12345).fit(features_train, target_train)

#predict the plan for every validation row
prediction_rf = model_rf.predict(features_valid)

#show ten random validation rows: predicted value next to the actual value
subrf_check = pd.DataFrame({'Actual': target_valid, 'Predicted': prediction_rf})
print(subrf_check.sample(10))

#share of validation rows predicted correctly
accuracy = accuracy_score(target_valid, prediction_rf)
print('accuracy = ', accuracy)


      Actual  Predicted
128        0          0
627        0          0
1459       0          0
164        1          1
1709       0          0
2379       0          0
1153       0          0
2118       0          0
231        0          1
1713       1          1
accuracy =  0.7759336099585062


Random forest has an accuracy level of 77.59%

#3.Logistic Regression

In [None]:
#Baseline logistic regression (liblinear solver), fitted on the training set
model_lr = LogisticRegression(solver='liblinear', random_state=12345).fit(features_train, target_train)

#predict the plan for every validation row
prediction_lr = model_lr.predict(features_valid)

#show ten random validation rows: predicted value next to the actual value
sublr_check = pd.DataFrame({'Actual': target_valid, 'Predicted': prediction_lr})
print(sublr_check.sample(10))

#share of validation rows predicted correctly
accuracy = accuracy_score(target_valid, prediction_lr)
print('accuracy = ', accuracy)


      Actual  Predicted
10         1          0
2354       0          0
912        0          0
3114       1          0
2765       1          0
1582       1          0
1696       1          0
1089       0          0
1717       0          0
2643       0          0
accuracy =  0.6846473029045643


logistic regression has an accuracy level of 68.46%

# Tuning Hyperparameters

#1.Decision Tree Hyperparameter Tuning

In [None]:
#Sweep max_depth from 1 to 14 and report the validation accuracy of each tree
for depth in range(1, 15):
    model = DecisionTreeClassifier(max_depth=depth, random_state=12345)
    model.fit(features_train, target_train)
    prediction = model.predict(features_valid)

    #score once, then print the stored value
    accuracy = accuracy_score(target_valid, prediction)
    print("max_depth =", depth, ": ", end='')
    print(accuracy)
        

max_depth = 1 : 0.7489626556016598
max_depth = 2 : 0.7759336099585062
max_depth = 3 : 0.7946058091286307
max_depth = 4 : 0.7759336099585062
max_depth = 5 : 0.7904564315352697
max_depth = 6 : 0.7987551867219918
max_depth = 7 : 0.7883817427385892
max_depth = 8 : 0.7904564315352697
max_depth = 9 : 0.7883817427385892
max_depth = 10 : 0.7925311203319502
max_depth = 11 : 0.7925311203319502
max_depth = 12 : 0.7655601659751037
max_depth = 13 : 0.7655601659751037
max_depth = 14 : 0.7531120331950207


max_depth 6 has the highest accuracy of 79.88%

In [None]:
#decision tree classifier refitted with max_depth=6
model_dtr = DecisionTreeClassifier(max_depth=6, random_state=12345)
model_dtr.fit(features_train, target_train)

#validation predictions and their accuracy
prediction = model_dtr.predict(features_valid)
accuracy = accuracy_score(target_valid, prediction)
print('accuracy = ', accuracy)

#sanity check: class shares among the predictions vs. among the actual validation labels
predicted_frequency = pd.Series(prediction).value_counts(normalize=True)
actual_frequency = target_valid.value_counts(normalize=True)
print(predicted_frequency)
print(actual_frequency)

accuracy =  0.7987551867219918
0    0.827801
1    0.172199
dtype: float64
0    0.680498
1    0.319502
Name: is_ultra, dtype: float64


#2.Random Forest Classifier Hyperparameter Tuning

In [None]:
#with max_depth fixed at 6, sweep n_estimators from 1 to 14 and report validation accuracy

for estimator in range(1, 15):
    model_rf = RandomForestClassifier(n_estimators=estimator, max_depth=6, random_state=12345)
    model_rf.fit(features_train, target_train)
    prediction_rf = model_rf.predict(features_valid)

    #score once, then print the stored value
    accuracy_test_rf = accuracy_score(target_valid, prediction_rf)
    print("estimator =", estimator, ": ", end='')
    print(accuracy_test_rf)

estimator = 1 : 0.7904564315352697
estimator = 2 : 0.7946058091286307
estimator = 3 : 0.8112033195020747
estimator = 4 : 0.8112033195020747
estimator = 5 : 0.8049792531120332
estimator = 6 : 0.8029045643153527
estimator = 7 : 0.8091286307053942
estimator = 8 : 0.8049792531120332
estimator = 9 : 0.8112033195020747
estimator = 10 : 0.8049792531120332
estimator = 11 : 0.8049792531120332
estimator = 12 : 0.8008298755186722
estimator = 13 : 0.8029045643153527
estimator = 14 : 0.7966804979253111


n_estimators = 3 gives us the highest accuracy of 81.12% (tied with 4 and 9 estimators)

In [None]:
#Random forest refitted with n_estimators=3 and max_depth=7
#NOTE(review): the n_estimators sweep was run with max_depth=6, not 7 - confirm which depth is intended

model_rf = RandomForestClassifier(n_estimators=3, max_depth=7, random_state=12345)
model_rf.fit(features_train, target_train)

#predict the plan for every validation row
prediction_rf = model_rf.predict(features_valid)

#show ten random validation rows: predicted value next to the actual value
subrf_check = pd.DataFrame({'Actual': target_valid, 'Predicted': prediction_rf})
print(subrf_check.sample(10))

#share of validation rows predicted correctly
accuracy = accuracy_score(target_valid, prediction_rf)
print('accuracy = ', accuracy)

#sanity check: class shares among the predictions vs. among the actual validation labels
predicted_frequency = pd.Series(prediction_rf).value_counts(normalize=True)
actual_frequency = target_valid.value_counts(normalize=True)
print(predicted_frequency)
print(actual_frequency)

      Actual  Predicted
1646       1          1
1072       0          0
290        1          0
1775       1          0
3058       0          0
2685       1          0
1229       1          0
2898       0          0
1169       0          0
1061       0          0
accuracy =  0.7925311203319502
0    0.813278
1    0.186722
dtype: float64
0    0.680498
1    0.319502
Name: is_ultra, dtype: float64


#3 Logistic Regression hyperparameter tuning

In [None]:
#Candidate values of C (inverse of the regularisation strength) to try
C_parameter_range = [0.001, 0.01, 0.1, 1, 10, 100]

for i in C_parameter_range:
    #fit a logistic regression with this C on the training data
    model_lr = LogisticRegression(C=i, solver='liblinear', random_state=12345)
    model_lr.fit(features_train, target_train)

    #score it on the validation set and print the stored value
    prediction_lr = model_lr.predict(features_valid)
    accuracy_lr = accuracy_score(target_valid, prediction_lr)
    print("C =", i, ": ", end='')
    print(accuracy_lr)

C = 0.001 : 0.6784232365145229
C = 0.01 : 0.6846473029045643
C = 0.1 : 0.6846473029045643
C = 1 : 0.6846473029045643
C = 10 : 0.6846473029045643
C = 100 : 0.7178423236514523


C = 100 gives the highest accuracy (71.78%); the smallest C on the 68.46% plateau is C = 0.01

In [None]:
#Logistic regression refitted with C=0.01
#NOTE(review): the C sweep printed a higher validation accuracy for C=100 - confirm that C=0.01 is the intended choice
model_lr = LogisticRegression(C=0.01, solver='liblinear', random_state=12345)
model_lr.fit(features_train, target_train)

#predict the plan for every validation row
prediction_lr = model_lr.predict(features_valid)

#show ten random validation rows: predicted value next to the actual value
sublr_check = pd.DataFrame({'Actual': target_valid, 'Predicted': prediction_lr})
print(sublr_check.sample(10))

#share of validation rows predicted correctly
accuracy = accuracy_score(target_valid, prediction_lr)
print('accuracy = ', accuracy)

#sanity check: class shares among the predictions vs. among the actual validation labels
predicted_frequency = pd.Series(prediction_lr).value_counts(normalize=True)
actual_frequency = target_valid.value_counts(normalize=True)
print(predicted_frequency)
print(actual_frequency)

      Actual  Predicted
2025       0          0
706        0          0
1845       1          0
3114       1          0
695        0          0
2747       0          0
1536       0          0
2797       0          0
1344       0          0
760        1          0
accuracy =  0.6846473029045643
0    0.991701
1    0.008299
dtype: float64
0    0.680498
1    0.319502
Name: is_ultra, dtype: float64


#Model Evaluation

In [None]:
#model evaluation using test data
#Each tuned model predicts the held-out test features and is scored against the true test labels.

#decision tree
test_predictdtr = model_dtr.predict(features_test)
dtr_accuracy = accuracy_score(target_test, test_predictdtr)
print('dtr_accuracy:', dtr_accuracy)

#random forest
test_predictrfc = model_rf.predict(features_test)
rfc_accuracy = accuracy_score(target_test, test_predictrfc)
print('rfc_accuracy:', rfc_accuracy)

#logistic regression
test_predictlr = model_lr.predict(features_test)
lr_accuracy = accuracy_score(target_test, test_predictlr)
#the printed value is the accuracy, so label it lr_accuracy like the other two models
print('lr_accuracy:', lr_accuracy)



dtr_accuracy: 0.7860696517412935
rfc_accuracy: 0.7873134328358209
test_predictlr: 0.7027363184079602


In [None]:
#precision of each tuned model on the test set
#precision_score takes (y_true, y_pred) in that order; passing the predictions
#first swaps the roles of the two arrays and reports recall instead of precision.
precision_decision_tree = precision_score(target_test, model_dtr.predict(features_test))
print('precision_decision_tree=',precision_decision_tree)

precision_random_forest = precision_score(target_test, model_rf.predict(features_test))
print('precision_random_forest=',precision_random_forest)

precision_logistic_regression = precision_score(target_test, model_lr.predict(features_test))
print('precision_logistic_regression=',precision_logistic_regression)

precision_decision_tree= 0.3817427385892116
precision_random_forest= 0.4190871369294606
precision_logistic_regression= 0.029045643153526972


The random forest classifier and the decision tree classifier both surpass the required accuracy of 75%, with test accuracy scores of 78.7% and 78.6% respectively. The random forest seems to be the superior model, as it has a precision of 0.42, which, though low, is higher than that of the other models.
