In [2]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd
import numpy as np

# Exploring
import scipy.stats as stats

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# plt.style.use('classic')

# Modeling
import statsmodels.api as sm

from scipy.stats import pearsonr

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, median_absolute_error

In [3]:
import pandas as pd
from env import user, host, pw

def get_connection(db, user, host, password):
    from sqlalchemy import create_engine
    url = f'mysql+pymysql://{user}:{password}@{host}/{db}'
    return create_engine(url)

conn = get_connection('telco_churn', user, host, pw)

df = pd.read_sql('SELECT c.customer_id, c.tenure, c.monthly_charges, \
                     c.total_charges, contract_types.contract_type \
                    FROM customers c \
                    JOIN contract_types USING(contract_type_id) \
                    WHERE contract_type = "Two year";',
                 conn)

In [4]:
df.head()

Unnamed: 0,customer_id,tenure,monthly_charges,total_charges,contract_type
0,0013-SMEOE,71,109.7,7904.25,Two year
1,0014-BMAQU,63,84.65,5377.8,Two year
2,0016-QLJIS,65,90.45,5957.9,Two year
3,0017-DINOC,54,45.2,2460.55,Two year
4,0017-IUDMW,72,116.8,8456.75,Two year


In [5]:
df.shape[0]
# df.info()

1695

In [6]:
df['customer_id'].dtype

dtype('O')

In [7]:
df.tail(10)

Unnamed: 0,customer_id,tenure,monthly_charges,total_charges,contract_type
1685,9945-PSVIP,25,18.7,383.65,Two year
1686,9950-MTGYX,28,20.3,487.95,Two year
1687,9953-ZMKSM,63,25.25,1559.3,Two year
1688,9958-MEKUC,72,103.95,7517.7,Two year
1689,9959-WOFKT,71,106.7,7382.25,Two year
1690,9964-WBQDJ,71,24.4,1725.4,Two year
1691,9972-EWRJS,67,19.25,1372.9,Two year
1692,9975-GPKZU,46,19.75,856.5,Two year
1693,9993-LHIEB,67,67.85,4627.65,Two year
1694,9995-HOTOH,63,59.0,3707.6,Two year


In [8]:
df.describe().loc["75%","tenure"]-df.describe().loc["25%","tenure"]

23.0

In [9]:
df.describe().quantile(.75)-df.describe().quantile(.25)

tenure             30.697659
monthly_charges    65.509601
dtype: float64

In [10]:
summary_df = df.describe()

In [11]:
summary_df

Unnamed: 0,tenure,monthly_charges
count,1695.0,1695.0
mean,56.735103,60.770413
std,18.209363,34.678865
min,0.0,18.4
25%,48.0,24.025
50%,64.0,64.35
75%,71.0,90.45
max,72.0,118.75


In [12]:
df.tenure.quantile(.75)-df.tenure.quantile(.25)

23.0

In [13]:
from scipy.stats import iqr

In [14]:
iqr(df.tenure)

23.0

In [15]:
df.isnull().sum()

customer_id        0
tenure             0
monthly_charges    0
total_charges      0
contract_type      0
dtype: int64

In [16]:
df.sort_values(['total_charges'], ascending=True)
# df.total_charges.value_counts()

# df2 = df.convert_objects(convert_numeric=True)
# df2.isnull().sum()

df = df['total_charges'].replace(r'\s+', np.nan, regex=True)
df.isnull().sum()

10

In [17]:
# df['total_charges'].apply(lambda x: 0 if x == ' ' else x)

In [18]:
df = df.fillna(0)
df.isnull().sum()

0

In [None]:
df.hist(column='monthly_charges', bins=20)

In [None]:
sns.