This notebook is mostly based on https://www.kaggle.com/kashnitsky/topic-1-exploratory-data-analysis-with-pandas


In [1]:
# Load libraries:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

ModuleNotFoundError: No module named 'pandas'

# Ingest data:

In [2]:
# Load the telecom churn data set into a DataFrame.
# The source is a CSV file: plain text, one record per line, fields separated by commas.
# The header line and the first two records of the file look like this:
#
# State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
# KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
# OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
df = pd.read_csv('../input/telecom_churn.csv', sep=',')



pandas can also handle other file formats and data sources

example: Excel (.xls, .xlsx) - pandas.read_excel(path, sheet_name, ... )
- see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html

example: SQL - pandas.read_sql(sql,con, ... )
- see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_sql.html#pandas.read_sql

Many other possible sources - see https://pandas.pydata.org/pandas-docs/stable/reference/io.html



# Get some quick info about the data:

In [3]:
# Preview the first five rows, nicely formatted.
# df is a 2D DataFrame object:
# - rows = examples, columns = features
# - both rows and columns carry an index
#   (in this case rows are indexed by number, columns by name)
df.head(n=5)

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [4]:
# Picking a single column by name yields a one-dimensional Series;
# here we look at its first five entries.
# (indexing is covered in much more detail later)
df['State'].head()

0    KS
1    OH
2    NJ
3    OH
4    OK
Name: State, dtype: object

In [5]:
# Basic info about the dataset:
# - df.shape is a (rows, columns) tuple
# - df.columns is the Index object holding the column (feature) names
print('Number of rows, columns', df.shape)
print('Columns (features):')
print(' ',df.columns)

Number of rows, columns (3333, 20)
Columns (features):
  Index(['State', 'Account length', 'Area code', 'International plan',
       'Voice mail plan', 'Number vmail messages', 'Total day minutes',
       'Total day calls', 'Total day charge', 'Total eve minutes',
       'Total eve calls', 'Total eve charge', 'Total night minutes',
       'Total night calls', 'Total night charge', 'Total intl minutes',
       'Total intl calls', 'Total intl charge', 'Customer service calls',
       'Churn'],
      dtype='object')


In [6]:
# Some more details
# - type (dtype) of each feature
# - non-null count for each feature
#   (compare with the total number of entries to spot missing values)
# - memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 20 columns):
State                     3333 non-null object
Account length            3333 non-null int64
Area code                 3333 non-null int64
International plan        3333 non-null object
Voice mail plan           3333 non-null object
Number vmail messages     3333 non-null int64
Total day minutes         3333 non-null float64
Total day calls           3333 non-null int64
Total day charge          3333 non-null float64
Total eve minutes         3333 non-null float64
Total eve calls           3333 non-null int64
Total eve charge          3333 non-null float64
Total night minutes       3333 non-null float64
Total night calls         3333 non-null int64
Total night charge        3333 non-null float64
Total intl minutes        3333 non-null float64
Total intl calls          3333 non-null int64
Total intl charge         3333 non-null float64
Customer service calls    3333 non-null int64


# Select parts of the DataFrame by row/column:

In [7]:
# Take the first six rows (positions 0 through 5) as a small working sample:
dfSmall = df.iloc[:6]
dfSmall

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False
5,AL,118,510,Yes,No,0,223.4,98,37.98,220.6,101,18.75,203.9,118,9.18,6.3,6,1.7,0,False


In [8]:
# Extract a single column; the result is a one-dimensional Series:
dfSmall.loc[:, 'International plan']

0     No
1     No
2     No
3    Yes
4    Yes
5    Yes
Name: International plan, dtype: object

In [9]:
# Extract a single value: the row labelled 0 of the 'International plan' column
dfSmall.loc[0, 'International plan']

'No'

In [10]:
# Label-based selection with .loc:
# rows labelled 0 through 4 (label slices include the end point)
# and an explicit list of column names.
dfSmall.loc[0:4, ['State', 'Voice mail plan']]

Unnamed: 0,State,Voice mail plan
0,KS,Yes
1,OH,Yes
2,NJ,No
3,OH,No
4,OK,No


In [11]:
# Label-based selection with .loc, this time with a *slice* of column names:
# every column from 'State' up to and including 'Voice mail plan'.
dfSmall.loc[0:4, 'State':'Voice mail plan']

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan
0,KS,128,415,No,Yes
1,OH,107,415,No,Yes
2,NJ,137,415,No,No
3,OH,84,408,Yes,No
4,OK,75,415,Yes,No


In [12]:
# Position-based selection with .iloc:
# rows at positions 1, 4 and 5; columns at positions 1, 2 and 3
# (positional slices exclude the end point).
df.iloc[[1, 4, 5], slice(1, 4)]

Unnamed: 0,Account length,Area code,International plan
1,107,415,No
4,75,415,Yes
5,118,510,Yes


# Select rows of a DataFrame by boolean condition:

First, compute a Series that contains the condition's value for each row:

In [13]:
# Element-wise comparison: one True/False per row of the DataFrame
dfSmall['International plan'].eq('Yes')

0    False
1    False
2    False
3     True
4     True
5     True
Name: International plan, dtype: bool

In [14]:
# Combine two per-row conditions with element-wise AND (&):
dfSmall['International plan'].eq('Yes') & dfSmall['State'].eq('OH')

0    False
1    False
2    False
3     True
4    False
5    False
dtype: bool

In [15]:
# Arbitrary per-row logic: apply calls the function once per row
# and collects the results into a boolean Series.
dfSmall.apply(
    lambda rec: rec['State'].startswith('O') and rec['International plan'] == 'Yes',
    axis='columns',
)

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

Once you have the boolean Series, you can simply index the DataFrame using it:

This selects the rows with a true boolean value.

In [16]:
# Keep only the rows for which the condition is True:
dfSmall.loc[dfSmall['International plan'] == 'Yes']

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False
5,AL,118,510,Yes,No,0,223.4,98,37.98,220.6,101,18.75,203.9,118,9.18,6.3,6,1.7,0,False


# More detailed stats on a DataFrame:

In [17]:
# Summary stats on numeric features
# (count, mean, std, min, quartiles and max for each one):
df.describe()

Unnamed: 0,Account length,Area code,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls
count,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0
mean,101.064806,437.182418,8.09901,179.775098,100.435644,30.562307,200.980348,100.114311,17.08354,200.872037,100.107711,9.039325,10.237294,4.479448,2.764581,1.562856
std,39.822106,42.37129,13.688365,54.467389,20.069084,9.259435,50.713844,19.922625,4.310668,50.573847,19.568609,2.275873,2.79184,2.461214,0.753773,1.315491
min,1.0,408.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.2,33.0,1.04,0.0,0.0,0.0,0.0
25%,74.0,408.0,0.0,143.7,87.0,24.43,166.6,87.0,14.16,167.0,87.0,7.52,8.5,3.0,2.3,1.0
50%,101.0,415.0,0.0,179.4,101.0,30.5,201.4,100.0,17.12,201.2,100.0,9.05,10.3,4.0,2.78,1.0
75%,127.0,510.0,20.0,216.4,114.0,36.79,235.3,114.0,20.0,235.3,113.0,10.59,12.1,6.0,3.27,2.0
max,243.0,510.0,51.0,350.8,165.0,59.64,363.7,170.0,30.91,395.0,175.0,17.77,20.0,20.0,5.4,9.0


In [18]:
# Summary stats on categorical features
# (include= restricts the summary to the object and bool columns; it reports the
#  count, the number of unique values, the most frequent value and its frequency):
df.describe(include=['object','bool'])

Unnamed: 0,State,International plan,Voice mail plan,Churn
count,3333,3333,3333,3333
unique,51,2,2,2
top,WV,No,No,False
freq,106,3010,2411,2850


In [19]:
# describe returns a DataFrame, so it can be indexed like any other:
# here only the mean and std rows for two of the features are kept.
df[['Account length', 'Total night calls']].describe().loc[['mean', 'std']]

Unnamed: 0,Account length,Total night calls
mean,101.064806,100.107711
std,39.822106,19.568609


In [20]:
# individual stat functions on a DataFrame - returns a Series:
# numeric_only=True restricts the mean to the int, float and bool columns.
# Without it, pandas 2.0+ raises a TypeError on the string columns
# ('State' and the two plan columns), and pandas 1.x emits a deprecation warning.
df.mean(numeric_only=True)

Account length            101.064806
Area code                 437.182418
Number vmail messages       8.099010
Total day minutes         179.775098
Total day calls           100.435644
Total day charge           30.562307
Total eve minutes         200.980348
Total eve calls           100.114311
Total eve charge           17.083540
Total night minutes       200.872037
Total night calls         100.107711
Total night charge          9.039325
Total intl minutes         10.237294
Total intl calls            4.479448
Total intl charge           2.764581
Customer service calls      1.562856
Churn                       0.144914
dtype: float64

In [21]:
# individual stat functions on a Series - returns a value
# (here a single float: the mean of one column):
df['Account length'].mean()

101.06480648064806

In [22]:
# value counts for a categorical feature - returns a Series
# (indexed by the distinct values, ordered from most to least frequent):
df['State'].value_counts()

WV    106
MN     84
NY     83
AL     80
OR     78
OH     78
WI     78
WY     77
VA     77
CT     74
VT     73
MI     73
ID     73
UT     72
TX     72
IN     71
KS     70
MD     70
NJ     68
MT     68
NC     68
CO     66
WA     66
NV     66
MA     65
RI     65
MS     65
AZ     64
MO     63
FL     63
ME     62
NM     62
ND     62
OK     61
DE     61
NE     61
SC     60
SD     60
KY     59
IL     58
NH     56
AR     55
GA     54
DC     54
TN     53
HI     53
AK     52
LA     51
PA     45
IA     44
CA     34
Name: State, dtype: int64

In [23]:
# Fraction of all customers living in each of two chosen states
# (normalize=True turns the counts into proportions):
df['State'].value_counts(normalize=True)[['NY', 'DC']]

NY    0.024902
DC    0.016202
Name: State, dtype: float64

In [24]:
# The same proportion for a single state - returns a plain value:
df['State'].value_counts(normalize=True).loc['NY']

0.024902490249024904

# Starting to look at the data in more complex ways:

In [25]:
# Restrict to customers with an international plan,
# then summarise the numeric features of that subset:
df.loc[df['International plan'].eq('Yes')].describe()

Unnamed: 0,Account length,Area code,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls
count,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0
mean,104.071207,443.4613,8.464396,187.986997,100.665635,31.95839,203.936842,100.486068,17.334923,196.410217,100.851393,8.838483,10.628173,4.609907,2.869907,1.464396
std,38.363388,45.355198,14.049639,56.850451,21.050065,9.664671,52.879435,18.285239,4.494749,52.333632,19.386144,2.355342,2.697787,2.629768,0.728151,1.337863
min,2.0,408.0,0.0,12.5,42.0,2.13,60.8,50.0,5.17,72.4,48.0,3.26,1.3,1.0,0.35,0.0
25%,79.5,415.0,0.0,148.2,85.5,25.19,167.75,88.0,14.255,156.5,88.0,7.04,9.0,3.0,2.43,0.0
50%,104.0,415.0,0.0,188.9,103.0,32.11,202.9,100.0,17.25,196.2,100.0,8.83,10.8,4.0,2.92,1.0
75%,132.5,510.0,20.0,228.65,117.0,38.87,239.3,114.0,20.345,232.8,114.0,10.475,12.2,6.0,3.29,2.0
max,224.0,510.0,45.0,346.8,146.0,58.96,363.7,159.0,30.91,352.5,154.0,15.86,20.0,20.0,5.4,9.0


In [26]:
# Churn rate (as a fraction) among customers living in NY or NJ:
# select the matching rows and the 'Churn' column in one step, then normalise the counts.
df.loc[df['State'].isin(['NY', 'NJ']), 'Churn'].value_counts(normalize=True)

False    0.781457
True     0.218543
Name: Churn, dtype: float64

In [27]:
# Churn rate by state (the mean of the boolean 'Churn' column is the fraction that churned):
# The aggregation is named by the string 'mean' rather than by passing np.mean:
# recent pandas (2.1+) warns that NumPy callables passed to agg are deprecated,
# and the string alias produces the same 'mean' column label.
df.groupby(['State'])['Churn'].agg(['mean'])
# can also use multiple features in each of the above lists

Unnamed: 0_level_0,mean
State,Unnamed: 1_level_1
AK,0.057692
AL,0.1
AR,0.2
AZ,0.0625
CA,0.264706
CO,0.136364
CT,0.162162
DC,0.092593
DE,0.147541
FL,0.126984


In [28]:
# Churn rate by state together with the number of customers per state,
# sorted by churn rate in increasing order:
# 'mean' is given as a string alias instead of np.mean (passing the NumPy callable
# is deprecated in pandas 2.1+); len is kept so the count column is still labelled 'len'.
df.groupby(['State'])['Churn'].agg(['mean',len]).sort_values(by='mean')


Unnamed: 0_level_0,mean,len
State,Unnamed: 1_level_1,Unnamed: 2_level_1
HI,0.056604,53
AK,0.057692,52
AZ,0.0625,64
VA,0.064935,77
IA,0.068182,44
LA,0.078431,51
NE,0.081967,61
IL,0.086207,58
WI,0.089744,78
RI,0.092308,65


In [29]:
# Alternative view of churn by state: a contingency table of counts,
# with one row per Churn value and one column per state.
pd.crosstab(index=df['Churn'], columns=df['State'])

State,AK,AL,AR,AZ,CA,CO,CT,DC,DE,FL,GA,HI,IA,ID,IL,IN,KS,KY,LA,MA,MD,ME,MI,MN,MO,MS,MT,NC,ND,NE,NH,NJ,NM,NV,NY,OH,OK,OR,PA,RI,SC,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY
Churn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
False,49,72,44,60,25,57,62,49,52,55,46,50,41,64,53,62,57,51,47,54,53,49,57,69,56,51,54,57,56,56,47,50,56,52,68,68,52,67,37,59,46,52,48,54,62,72,65,52,71,96,68
True,3,8,11,4,9,9,12,5,9,8,8,3,3,9,5,9,13,8,4,11,17,13,16,15,7,14,14,11,6,5,9,18,6,14,15,10,9,11,8,6,14,8,5,18,10,5,8,14,7,10,9


In [30]:
# A third approach: a pivot table with one row per state,
# holding the mean of both 'Churn' and 'Total day minutes'.
df.pivot_table(
    values=['Churn', 'Total day minutes'],
    index=['State'],
    aggfunc='mean',
)

Unnamed: 0_level_0,Churn,Total day minutes
State,Unnamed: 1_level_1,Unnamed: 2_level_1
AK,0.057692,178.384615
AL,0.1,186.01
AR,0.2,176.116364
AZ,0.0625,171.604687
CA,0.264706,183.564706
CO,0.136364,178.712121
CT,0.162162,175.140541
DC,0.092593,171.37963
DE,0.147541,174.583607
FL,0.126984,179.533333


# Some data manipulation:

In [31]:
# Append a new boolean column at the end of the DataFrame:
# it flags customers whose daytime minutes exceed the overall mean.
meanDayMinutes = df['Total day minutes'].mean()
df['Heavy day user'] = df['Total day minutes'].gt(meanDayMinutes)
df.head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn,Heavy day user
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False,True
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False,True
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False,True
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False,False


In [32]:
# Delete a column:
# errors='ignore' makes the cell safe to run more than once; without it,
# a second run raises KeyError because the column is already gone.
df.drop('Heavy day user',axis=1,inplace=True,errors='ignore')
# use axis=0 to delete rows
df.head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [33]:
# Add a column in a specified position:
# DataFrame.insert raises ValueError when the column already exists, so any
# existing copy is removed first. This keeps the cell safe to run more than once
# and guarantees the column ends up at position 1.
if 'Heavy day user' in df.columns:
    df.drop('Heavy day user',axis=1,inplace=True)
df.insert(loc=1, column='Heavy day user', value = df['Total day minutes']>meanDayMinutes)
df.head()

Unnamed: 0,State,Heavy day user,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,True,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,False,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,True,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,True,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,False,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [34]:
# How many customers are above / not above the mean of 'Total day minutes'?
df['Heavy day user'].value_counts()

False    1673
True     1660
Name: Heavy day user, dtype: int64

In [35]:
# Do heavy day users churn more often?
# For each value of 'Heavy day user': the mean of 'Churn' and the group size.
df.pivot_table(
    values=['Churn'],
    index=['Heavy day user'],
    aggfunc=['mean', len],
)


Unnamed: 0_level_0,mean,len
Unnamed: 0_level_1,Churn,Churn
Heavy day user,Unnamed: 1_level_2,Unnamed: 2_level_2
False,0.114764,1673
True,0.175301,1660


In [36]:
# Remove, in place, every row whose 'Churn' value is missing
# (this dataset has none, so no rows are actually removed):
df.dropna(subset=['Churn'], axis='index', inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3333 entries, 0 to 3332
Data columns (total 21 columns):
State                     3333 non-null object
Heavy day user            3333 non-null bool
Account length            3333 non-null int64
Area code                 3333 non-null int64
International plan        3333 non-null object
Voice mail plan           3333 non-null object
Number vmail messages     3333 non-null int64
Total day minutes         3333 non-null float64
Total day calls           3333 non-null int64
Total day charge          3333 non-null float64
Total eve minutes         3333 non-null float64
Total eve calls           3333 non-null int64
Total eve charge          3333 non-null float64
Total night minutes       3333 non-null float64
Total night calls         3333 non-null int64
Total night charge        3333 non-null float64
Total intl minutes        3333 non-null float64
Total intl calls          3333 non-null int64
Total intl charge         3333 non-null float64
C

In [37]:
# convert 'International plan' and 'Voice mail plan' from 'Yes'/'No' to int 1/0.
# Each conversion runs only while the column is not yet integer-typed: comparing an
# already-converted 0/1 column with 'Yes' would silently reset every value to 0
# if this cell were run a second time.
if not pd.api.types.is_integer_dtype(df['International plan']):
    df['International plan'] = (df['International plan']=='Yes').astype('int')
# Or could use map (supply a dict of original -> transformed values):
# df['Voice mail plan'] = df['Voice mail plan'].map({'Yes':1,'No':0})
# Or could use apply (most general - supply a function that converts the values):
if not pd.api.types.is_integer_dtype(df['Voice mail plan']):
    df['Voice mail plan'] = df['Voice mail plan'].apply(lambda x : int(x=='Yes'))

# convert 'Heavy day user' and 'Churn' to int 0/1
# (a plain cast, so repeating it leaves the values unchanged):
df['Heavy day user'] = df['Heavy day user'].astype('int')
df['Churn'] = df['Churn'].astype('int')
df.head()

Unnamed: 0,State,Heavy day user,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,1,128,415,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,0
1,OH,0,107,415,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,0
2,NJ,1,137,415,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,0
3,OH,1,84,408,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,0
4,OK,0,75,415,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,0


In [38]:
# Convert DataFrame column to numpy array (ex. for use by ML algorithms):
# (.values on the selected Series gives a one-dimensional array)
df['Churn'].values

array([0, 0, 0, ..., 0, 0, 0])

In [39]:
# Turn the remaining columns into a two-dimensional numpy array
# (ex. as the feature matrix for ML algorithms), after leaving out
# the 'State' column and the 'Churn' target:
df.drop(columns=['State', 'Churn']).values

array([[  1.  , 128.  , 415.  , ...,   3.  ,   2.7 ,   1.  ],
       [  0.  , 107.  , 415.  , ...,   3.  ,   3.7 ,   1.  ],
       [  1.  , 137.  , 415.  , ...,   5.  ,   3.29,   0.  ],
       ...,
       [  1.  ,  28.  , 510.  , ...,   6.  ,   3.81,   2.  ],
       [  1.  , 184.  , 510.  , ...,  10.  ,   1.35,   2.  ],
       [  1.  ,  74.  , 415.  , ...,   4.  ,   3.7 ,   0.  ]])