In [1]:
import pandas as pd
import numpy as np
import env
from pydataset import data
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
def get_db_url(dbname) -> str:
    """Build the SQLAlchemy connection URL for a database on the configured host.

    Reads ``user``, ``password`` and ``host`` from the local ``env`` module and
    formats them into a ``mysql+pymysql://`` URL. The user name and password
    are percent-encoded so that reserved characters such as ``@``, ``:`` or
    ``/`` cannot corrupt the URL structure.

    Parameters
    ----------
    dbname : str
        Name of the database to connect to.

    Returns
    -------
    str
        URL of the form ``mysql+pymysql://user:password@host/dbname``. The
        password is embedded in the string, so avoid displaying or logging it.
    """
    # Local import keeps this cell runnable on its own.
    from urllib.parse import quote_plus

    url = 'mysql+pymysql://{}:{}@{}/{}'
    # NOTE(review): assumes env.user / env.password hold the raw (not already
    # percent-encoded) credentials -- confirm against env.py.
    return url.format(
        quote_plus(str(env.user)),
        quote_plus(str(env.password)),
        env.host,
        dbname,
    )

In [3]:
# Connection URL for the telco_churn database.
# Deliberately NOT echoed as the cell result: the URL embeds the database
# user, password and host, and a displayed value is saved with the notebook.
telco_churn_url = get_db_url('telco_churn')

In [4]:
# Load every row and column of the `customers` table into a DataFrame.
query = (
    "\n"
    "SELECT * \n"
    "FROM customers\n"
)

customers = pd.read_sql(sql=query, con=telco_churn_url)

In [5]:
# Load every row and column of the `contract_types` table into a DataFrame.
query = (
    "\n"
    "SELECT * \n"
    "FROM contract_types\n"
)

contract_types = pd.read_sql(sql=query, con=telco_churn_url)

In [6]:
# Join each customer to its contract_types row on the shared contract_type_id key.
# validate='many_to_one' makes pandas raise MergeError if contract_types ever
# contains a duplicated id, which would otherwise silently duplicate customer rows.
df = customers.merge(
    contract_types,
    on='contract_type_id',
    how='inner',
    validate='many_to_one',
)

In [7]:
# Narrow the merged frame to the five columns used in the rest of the notebook.
df = df.loc[
    :,
    ["customer_id", "monthly_charges", "tenure", "total_charges", "contract_type"],
]



In [8]:
# Display the five-column frame for a visual check; the recorded rendering
# abbreviates the middle rows with "...".
df

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges,contract_type
0,0002-ORFBO,65.60,9,593.3,One year
1,0020-JDNXP,61.25,34,1993.2,One year
2,0022-TCJCI,62.70,45,2791.5,One year
3,0023-UYUPN,25.20,50,1306.3,One year
4,0036-IHMOT,103.70,55,5656.75,One year
...,...,...,...,...,...
7038,9964-WBQDJ,24.40,71,1725.4,Two year
7039,9972-EWRJS,19.25,67,1372.9,Two year
7040,9975-GPKZU,19.75,46,856.5,Two year
7041,9993-LHIEB,67.85,67,4627.65,Two year


In [9]:
# Guard: none of the columns used below may contain null (NaN/None) values.
# The message names the offending column so a failure is actionable.
# Note: isna() only detects real nulls; blank strings are not counted here.
for column in ("total_charges", "monthly_charges", "tenure", "contract_type"):
    missing = df[column].isna().sum()
    assert missing == 0, (
        "There are {} missing values in column '{}'. "
        "Handle them before proceeding".format(missing, column)
    )

In [10]:
# Summarize dtypes and non-null counts. In the recorded output total_charges
# is reported as `object` rather than a numeric dtype, so it still needs
# converting before it can be used as a number.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7043 entries, 0 to 7042
Data columns (total 5 columns):
customer_id        7043 non-null object
monthly_charges    7043 non-null float64
tenure             7043 non-null int64
total_charges      7043 non-null object
contract_type      7043 non-null object
dtypes: float64(1), int64(1), object(3)
memory usage: 330.1+ KB


In [11]:
# Per-column count of null values. `isnull` is pandas' alias for `isna`, so the
# second print repeats the first (the recorded output shows two identical tables).
# All counts are 0 even though total_charges holds blank strings in some rows:
# these checks only detect real nulls, not empty text.
print(df.isna().sum())
print(df.isnull().sum())

customer_id        0
monthly_charges    0
tenure             0
total_charges      0
contract_type      0
dtype: int64
customer_id        0
monthly_charges    0
tenure             0
total_charges      0
contract_type      0
dtype: int64


In [12]:
# Keep only customers on a two-year contract.
# .copy() yields an independent frame, so assigning to its columns later does
# not raise pandas' SettingWithCopyWarning.
df = df[df.contract_type == 'Two year'].copy()

In [13]:
# Re-inspect the frame after the 'Two year' filter (1695 rows in the recorded
# output); total_charges is still `object` dtype at this point.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1695 entries, 5348 to 7042
Data columns (total 5 columns):
customer_id        1695 non-null object
monthly_charges    1695 non-null float64
tenure             1695 non-null int64
total_charges      1695 non-null object
contract_type      1695 non-null object
dtypes: float64(1), int64(1), object(3)
memory usage: 79.5+ KB


In [14]:
# Strip surrounding whitespace from total_charges so blank entries become "".
# .assign returns a new frame instead of setting a column on a filtered slice,
# which is what triggered pandas' SettingWithCopyWarning in this cell.
df = df.assign(total_charges=df.total_charges.str.strip())


In [15]:
# Count the number of empty string entries
df[df.total_charges == ""]

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges,contract_type
5582,1371-DWPAZ,56.05,0,,Two year
5764,2520-SGTTA,20.0,0,,Two year
5801,2775-SEFEE,61.9,0,,Two year
5853,3115-CZMZD,20.25,0,,Two year
5872,3213-VVOLG,25.35,0,,Two year
6026,4075-WKNIU,73.35,0,,Two year
6064,4367-NUYAO,25.75,0,,Two year
6074,4472-LVYGI,52.55,0,,Two year
6289,5709-LVOEQ,80.85,0,,Two year
6641,7644-OMVMY,19.85,0,,Two year


In [16]:
# Drop the rows whose total_charges is blank (10 rows in the recorded run:
# 1695 -> 1685), then convert the column from text to float.
# Chaining through .assign avoids assigning to a column of a freshly filtered
# slice, which is the pattern that raises SettingWithCopyWarning.
df = (
    df[df.total_charges != ""]
    .assign(total_charges=lambda frame: frame.total_charges.astype(float))
)

In [17]:
# Confirm the cleanup: the recorded output shows 1685 rows and total_charges
# now stored as float64.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1685 entries, 5348 to 7042
Data columns (total 5 columns):
customer_id        1685 non-null object
monthly_charges    1685 non-null float64
tenure             1685 non-null int64
total_charges      1685 non-null float64
contract_type      1685 non-null object
dtypes: float64(2), int64(1), object(2)
memory usage: 79.0+ KB


In [18]:
# Sanity check: tally rows per contract type in the filtered frame.
df["contract_type"].value_counts()

Two year    1685
Name: contract_type, dtype: int64