- Created date: 2023. 02. 07
- Last updated: 2023. 02. 07
- Author: Chongho Pyo
- Version: 0.1

- Changes
    - label encoding 'media_type'
    - pd.get_dummies(drop_first = True)

In [1]:
import pandas as pd
import numpy as np

from pandas_profiling import ProfileReport

import missingno as msno

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import IsolationForest

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

import seaborn as sns
import matplotlib.pyplot as plt

# Data Acquisition

## Read dataset

In [2]:
df_raw = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/InformationSystem/GroupProject/media prediction and its cost.csv")

## Data Structure Overview

In [3]:
df_raw.head()

Unnamed: 0,food_category,food_department,food_family,store_sales(in millions),store_cost(in millions),unit_sales(in millions),promotion_name,sales_country,marital_status,gender,...,grocery_sqft,frozen_sqft,meat_sqft,coffee_bar,video_store,salad_bar,prepared_food,florist,media_type,cost
0,Breakfast Foods,Frozen Foods,Food,7.36,2.7232,4.0,Bag Stuffers,USA,M,F,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,"Daily Paper, Radio",126.62
1,Breakfast Foods,Frozen Foods,Food,5.52,2.5944,3.0,Cash Register Lottery,USA,M,M,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,"Daily Paper, Radio",59.86
2,Breakfast Foods,Frozen Foods,Food,3.68,1.3616,2.0,High Roller Savings,USA,S,F,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,"Daily Paper, Radio",84.16
3,Breakfast Foods,Frozen Foods,Food,3.68,1.1776,2.0,Cash Register Lottery,USA,M,F,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,In-Store Coupon,95.78
4,Breakfast Foods,Frozen Foods,Food,4.08,1.428,3.0,Double Down Sale,USA,M,M,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,Radio,50.79


In [4]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60428 entries, 0 to 60427
Data columns (total 40 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   food_category               60428 non-null  object 
 1   food_department             60428 non-null  object 
 2   food_family                 60428 non-null  object 
 3   store_sales(in millions)    60428 non-null  float64
 4   store_cost(in millions)     60428 non-null  float64
 5   unit_sales(in millions)     60428 non-null  float64
 6   promotion_name              60428 non-null  object 
 7   sales_country               60428 non-null  object 
 8   marital_status              60428 non-null  object 
 9   gender                      60428 non-null  object 
 10  total_children              60428 non-null  float64
 11  education                   60428 non-null  object 
 12  member_card                 60428 non-null  object 
 13  occupation                  604

In [5]:
# profile = ProfileReport(df_raw, title="Profiling Report")

# # To retrieve the list of variables which are rejected due to high correlation
# rejected_variables = profile.get_rejected_variables()

# profile

### Columns based on data types

In [6]:
df_raw.dtypes.value_counts()

float64    23
object     17
dtype: int64

In [7]:
float_cols = df_raw.select_dtypes(include = ['float']).columns.tolist()
obj_cols = df_raw.select_dtypes(include = ['object']).columns.tolist()

print("Numerical columns:", float_cols)
print("Categorical columns:", obj_cols)

Numerical columns: ['store_sales(in millions)', 'store_cost(in millions)', 'unit_sales(in millions)', 'total_children', 'avg_cars_at home(approx)', 'num_children_at_home', 'avg_cars_at home(approx).1', 'SRP', 'gross_weight', 'net_weight', 'recyclable_package', 'low_fat', 'units_per_case', 'store_sqft', 'grocery_sqft', 'frozen_sqft', 'meat_sqft', 'coffee_bar', 'video_store', 'salad_bar', 'prepared_food', 'florist', 'cost']
Categorical columns: ['food_category', 'food_department', 'food_family', 'promotion_name', 'sales_country', 'marital_status', 'gender', 'education', 'member_card', 'occupation', 'houseowner', 'avg. yearly_income', 'brand_name', 'store_type', 'store_city', 'store_state', 'media_type']


Some columns in float_cols are pre-dummified, which means they are categorical attributes. Therefore, all attributes were split into three parts: num_cols, cate_cols, pre_dum_cols
- num_cols: numerical columns
- cate_cols: categorical columns (object dtypes)
- pre_dum_cols: pre-dummified columns (e.g., boolean dtypes)

In [8]:
# To see if a column is numerical or categorical

num_cols, cate_cols, pre_dum_cols = [], [], []
for c in df_raw.columns:
    uni_len = len(set(df_raw[c]))
    dtype = df_raw[c].dtypes

    if dtype == object:
        cate_cols.append(c)
    else: # dtype == float
        if uni_len == 2:
            pre_dum_cols.append(c)
        elif uni_len < 10:
            num_cols.append(c)
            print("Number of unique values: %d \t Datatype: %s \t Column name: %s" %(uni_len, dtype, c))
        else:
            num_cols.append(c)

Number of unique values: 6 	 Datatype: float64 	 Column name: unit_sales(in millions)
Number of unique values: 6 	 Datatype: float64 	 Column name: total_children
Number of unique values: 5 	 Datatype: float64 	 Column name: avg_cars_at home(approx)
Number of unique values: 6 	 Datatype: float64 	 Column name: num_children_at_home
Number of unique values: 5 	 Datatype: float64 	 Column name: avg_cars_at home(approx).1


In [9]:
len(num_cols) + len(pre_dum_cols) + len(cate_cols)

40

### Descriptive Statistics

In [10]:
# Columns with numerical values
df_raw[num_cols].describe()

Unnamed: 0,store_sales(in millions),store_cost(in millions),unit_sales(in millions),total_children,avg_cars_at home(approx),num_children_at_home,avg_cars_at home(approx).1,SRP,gross_weight,net_weight,units_per_case,store_sqft,grocery_sqft,frozen_sqft,meat_sqft,cost
count,60428.0,60428.0,60428.0,60428.0,60428.0,60428.0,60428.0,60428.0,60428.0,60428.0,60428.0,60428.0,60428.0,60428.0,60428.0,60428.0
mean,6.541031,2.61946,3.093169,2.533875,2.200271,0.829351,2.200271,2.115258,13.806433,11.796289,18.860694,27988.477494,19133.799696,5312.852552,3541.84628,99.262366
std,3.463047,1.453009,0.827677,1.490165,1.109644,1.303424,1.109644,0.932829,4.622693,4.682986,10.258555,5701.02209,3987.395735,1575.907263,1050.471635,30.011257
min,0.51,0.1632,1.0,0.0,0.0,0.0,0.0,0.5,6.0,3.05,1.0,20319.0,13305.0,2452.0,1635.0,50.79
25%,3.81,1.5,3.0,1.0,1.0,0.0,1.0,1.41,9.7,7.71,10.0,23593.0,16232.0,4746.0,3164.0,69.65
50%,5.94,2.3856,3.0,3.0,2.0,0.0,2.0,2.13,13.6,11.6,19.0,27694.0,18670.0,5062.0,3375.0,98.52
75%,8.67,3.484025,4.0,4.0,3.0,1.0,3.0,2.79,17.7,16.0,28.0,30797.0,22123.0,5751.0,3834.0,126.62
max,22.92,9.7265,6.0,5.0,4.0,5.0,4.0,3.98,21.9,20.8,36.0,39696.0,30351.0,9184.0,6122.0,149.75


In [11]:
# Columns with categorical values
df_raw[cate_cols].describe()

Unnamed: 0,food_category,food_department,food_family,promotion_name,sales_country,marital_status,gender,education,member_card,occupation,houseowner,avg. yearly_income,brand_name,store_type,store_city,store_state,media_type
count,60428,60428,60428,60428,60428,60428,60428,60428,60428,60428,60428,60428,60428,60428,60428,60428,60428
unique,45,22,3,49,3,2,2,5,4,5,2,8,111,5,19,10,13
top,Vegetables,Produce,Food,Weekend Markdown,USA,S,F,Partial High School,Bronze,Professional,Y,$30K - $50K,Hermanos,Supermarket,Tacoma,WA,"Daily Paper, Radio"
freq,7440,8521,43284,2330,38892,30355,30942,18201,33807,19915,36510,19514,1839,26192,5704,19370,6820


## Missing Values

In [12]:
msno.matrix(df_raw)

<matplotlib.axes._subplots.AxesSubplot at 0x7f7314eae8b0>

In [13]:
df_raw.isnull().sum()

food_category                 0
food_department               0
food_family                   0
store_sales(in millions)      0
store_cost(in millions)       0
unit_sales(in millions)       0
promotion_name                0
sales_country                 0
marital_status                0
gender                        0
total_children                0
education                     0
member_card                   0
occupation                    0
houseowner                    0
avg_cars_at home(approx)      0
avg. yearly_income            0
num_children_at_home          0
avg_cars_at home(approx).1    0
brand_name                    0
SRP                           0
gross_weight                  0
net_weight                    0
recyclable_package            0
low_fat                       0
units_per_case                0
store_type                    0
store_city                    0
store_state                   0
store_sqft                    0
grocery_sqft                  0
frozen_s

# Pre-processing

## Data Cleaning

### Outlier Detection

In [14]:
'''
Since 43884 outliers (73%) were removed without specifying contamination and max_samples, 
the thresholds (contamination, max_samples) were set
'''

isolation_forest = IsolationForest(contamination = 0.05, max_samples = 100, random_state = 0)

# DataFrame to store the outlier predictions
outlier_preds = pd.DataFrame()

# Iterate all numerical columns
for col in num_cols:
    # Fit the Isolation Forest to the current column
    isolation_forest.fit(df_raw[[col]])
    
    # Get the outlier predictions for the current column
    outlier_pred = isolation_forest.predict(df_raw[[col]])
    
    # Add the outlier predictions as a new column in the outlier_preds DataFrame
    outlier_preds[col] = outlier_pred

outlier_preds



Unnamed: 0,store_sales(in millions),store_cost(in millions),unit_sales(in millions),total_children,avg_cars_at home(approx),num_children_at_home,avg_cars_at home(approx).1,SRP,gross_weight,net_weight,units_per_case,store_sqft,grocery_sqft,frozen_sqft,meat_sqft,cost
0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60423,1,1,-1,1,1,1,1,1,1,1,1,1,1,1,1,1
60424,1,-1,-1,1,1,1,1,1,1,1,-1,1,1,1,1,1
60425,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
60426,1,1,1,1,1,1,1,1,1,1,1,1,-1,1,1,1


### Drop Outliers

In [15]:
df_woOut = df_raw.iloc[outlier_pred == 1]
df_woOut

Unnamed: 0,food_category,food_department,food_family,store_sales(in millions),store_cost(in millions),unit_sales(in millions),promotion_name,sales_country,marital_status,gender,...,grocery_sqft,frozen_sqft,meat_sqft,coffee_bar,video_store,salad_bar,prepared_food,florist,media_type,cost
0,Breakfast Foods,Frozen Foods,Food,7.36,2.7232,4.0,Bag Stuffers,USA,M,F,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,"Daily Paper, Radio",126.62
1,Breakfast Foods,Frozen Foods,Food,5.52,2.5944,3.0,Cash Register Lottery,USA,M,M,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,"Daily Paper, Radio",59.86
2,Breakfast Foods,Frozen Foods,Food,3.68,1.3616,2.0,High Roller Savings,USA,S,F,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,"Daily Paper, Radio",84.16
3,Breakfast Foods,Frozen Foods,Food,3.68,1.1776,2.0,Cash Register Lottery,USA,M,F,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,In-Store Coupon,95.78
6,Breakfast Foods,Frozen Foods,Food,5.44,2.5568,4.0,Cash Register Lottery,USA,S,F,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,In-Store Coupon,95.78
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60423,Specialty,Carousel,Non-Consumable,2.76,1.3248,1.0,You Save Days,USA,M,F,...,15321.0,4294.0,2863.0,1.0,0.0,0.0,0.0,0.0,In-Store Coupon,95.25
60424,Specialty,Carousel,Non-Consumable,1.60,0.4960,1.0,Price Cutters,USA,S,F,...,15321.0,4294.0,2863.0,1.0,0.0,0.0,0.0,0.0,Sunday Paper,69.42
60425,Specialty,Carousel,Non-Consumable,5.52,2.5392,2.0,Weekend Markdown,USA,M,M,...,15321.0,4294.0,2863.0,1.0,0.0,0.0,0.0,0.0,"Sunday Paper, Radio, TV",67.51
60426,Specialty,Carousel,Non-Consumable,8.28,2.5668,3.0,Sales Days,Canada,S,M,...,27463.0,4193.0,2795.0,1.0,0.0,0.0,0.0,1.0,Sunday Paper,132.88


In [16]:
diff = len(df_raw) - len(df_woOut)

print("Number of dropped outliers:", diff)
print("Proportion of dropped outliers:", round(diff/len(df_raw), 2))

Number of dropped outliers: 3005
Proportion of dropped outliers: 0.05


### Boxplots

In [17]:
# plt.rc('font', size=14)
# plt.rc('axes', labelsize=14, titlesize=10)
# plt.rc('legend', fontsize=14)
# plt.rc('xtick', labelsize=10)
# plt.rc('ytick', labelsize=10)

ncols = 4
nrows = int(np.ceil(len(num_cols) / ncols))

fig, ax = plt.subplots(nrows, ncols, figsize=(15, 15))
ax = ax.flatten()

for i, col in enumerate(num_cols):
    ax[i].boxplot(df_woOut[col])
    ax[i].set_title(f"Boxplot of {col}", fontsize=10, fontweight='bold')
    ax[i].set_ylabel(col, fontsize=10)

for i in range(len(num_cols), len(ax)):
    fig.delaxes(ax[i])
    plt.tight_layout()
    plt.show()

### Histograms

In [18]:
ncols = 4
nrows = int(np.ceil(len(num_cols) / ncols))

fig, ax = plt.subplots(nrows, ncols, figsize=(15, 15))
ax = ax.flatten()

for i, col in enumerate(num_cols):
    ax[i].hist(df_woOut[col], bins = 50, color = 'lightblue', ec = 'grey')
    ax[i].set_title(f"Boxplot of {col}", fontsize=10, fontweight='bold')
    ax[i].set_ylabel(col, fontsize=10)

for i in range(len(num_cols), len(ax)):
    fig.delaxes(ax[i])
    plt.tight_layout()
    plt.show()

## Feature Scaling

### Correlation

- Correlation matrix to figure out what variables are highly correlated
- Since multicollinearity can distort the model's performance, we aim to elimiate either of highly correlated attributes

In [19]:
#cols = [c for c in float_cols if c != 'cost'] # exclude target

plt.figure(figsize = (13, 10))

sns.heatmap(df_woOut[num_cols].corr(), annot = True, cmap = 'coolwarm', annot_kws = {'size' : 7})
plt.title("Correlation Matrix", fontsize = 20, fontweight = 'bold')

Text(0.5, 1.0, 'Correlation Matrix')

In [20]:
# Simple Scatterplots
sns.pairplot(df_woOut[['meat_sqft', 'store_sqft', 'grocery_sqft']])

<seaborn.axisgrid.PairGrid at 0x7f73125cf400>

### Multicollinearity (VIF)

- VIF (Variance Inflation Factor
    - A measure of the magnitude of multicollinearity in a regression model
    - VIF = 1 : no multicollinearity
    - VIF > 5 : usually considered as a hight level of multicollinearity

In [21]:
def vif_calculator(df, dependent_col):
    """
    A function to calculate the variance inflation factor (VIF) for a given set of predictors and a response variable.
    
    Parameters:
    df (DataFrame): A pandas DataFrame containing the predictor variables and the response variable.
    dependent_col (str): The name of the response variable in the DataFrame.
    
    Returns:
    vif (DataFrame): A pandas DataFrame containing the VIF for each predictor and the corresponding feature.
    
    """

    X = df.drop(dependent_col, axis=1)

    vif = pd.DataFrame()
    vif["Features"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    vif = vif.sort_values(by = "VIF", ascending = False)

    return vif

In [22]:
vif_calculator(df_woOut[num_cols], 'cost')

  vif = 1. / (1. - r_squared_i)


Unnamed: 0,Features,VIF
4,avg_cars_at home(approx),inf
6,avg_cars_at home(approx).1,inf
11,store_sqft,4292223000.0
12,grocery_sqft,2010594000.0
13,frozen_sqft,274892200.0
14,meat_sqft,181328800.0
8,gross_weight,415.2261
9,net_weight,313.1175
0,store_sales(in millions),92.28619
1,store_cost(in millions),47.97441


### Synthesized attributes (optional)
- If the model is to include synthesized attributes below, "division by zero" has to be taken into consideration first.

In [23]:
#Create a new dataframe to store the synthesized attributes

df_syn = df_woOut.copy()

df_syn['sales_per_sqft'] = df_syn['store_sales(in millions)'] / df_syn['store_sqft']
df_syn['cost_per_sqft'] = df_syn['store_cost(in millions)'] / df_syn['store_sqft']
df_syn['units_sold_per_sqft'] = df_syn['unit_sales(in millions)'] / df_syn['store_sqft']
df_syn['avg_cars_per_household'] = df_syn['avg_cars_at home(approx)'] / df_syn['total_children']
df_syn['avg_children_at_home_per_household'] = df_syn['num_children_at_home'] / df_syn['total_children']
df_syn['avg_cars_per_household_with_children'] = df_syn['avg_cars_at home(approx).1'] / df_syn['num_children_at_home']
df_syn['weight_per_unit'] = df_syn['net_weight'] / df_syn['unit_sales(in millions)']
df_syn['weight_per_case'] = df_syn['gross_weight'] / df_syn['units_per_case']
df_syn['grocery_sqft_per_store_sqft'] = df_syn['grocery_sqft'] / df_syn['store_sqft']
df_syn['frozen_sqft_per_store_sqft'] = df_syn['frozen_sqft'] / df_syn['store_sqft']

df_syn.head()

Unnamed: 0,food_category,food_department,food_family,store_sales(in millions),store_cost(in millions),unit_sales(in millions),promotion_name,sales_country,marital_status,gender,...,sales_per_sqft,cost_per_sqft,units_sold_per_sqft,avg_cars_per_household,avg_children_at_home_per_household,avg_cars_per_household_with_children,weight_per_unit,weight_per_case,grocery_sqft_per_store_sqft,frozen_sqft_per_store_sqft
0,Breakfast Foods,Frozen Foods,Food,7.36,2.7232,4.0,Bag Stuffers,USA,M,F,...,0.000266,9.8e-05,0.000144,1.0,1.0,1.0,4.425,1.158824,0.674153,0.19553
1,Breakfast Foods,Frozen Foods,Food,5.52,2.5944,3.0,Cash Register Lottery,USA,M,M,...,0.000199,9.4e-05,0.000108,inf,,inf,5.9,1.158824,0.674153,0.19553
2,Breakfast Foods,Frozen Foods,Food,3.68,1.3616,2.0,High Roller Savings,USA,S,F,...,0.000133,4.9e-05,7.2e-05,0.25,0.0,inf,8.85,1.158824,0.674153,0.19553
3,Breakfast Foods,Frozen Foods,Food,3.68,1.1776,2.0,Cash Register Lottery,USA,M,F,...,0.000133,4.3e-05,7.2e-05,1.0,1.0,1.0,8.85,1.158824,0.674153,0.19553
6,Breakfast Foods,Frozen Foods,Food,5.44,2.5568,4.0,Cash Register Lottery,USA,S,F,...,0.000196,9.2e-05,0.000144,0.5,0.0,inf,1.2775,0.245517,0.674153,0.19553


### StandardScaler vs MinMax ?
- Depends on models to be used

In [24]:
cols = [c for c in num_cols if c != 'cost'] # exclude target

min_max_scaler = MinMaxScaler(feature_range=(-1, 1))
MinMax_scaled = min_max_scaler.fit_transform(df_woOut[cols])

MinMax_scaled

array([[-0.38866577, -0.46461995,  0.2       , ..., -0.37052681,
        -0.11972668, -0.11967907],
       [-0.55287818, -0.49155626, -0.2       , ..., -0.37052681,
        -0.11972668, -0.11967907],
       [-0.71709058, -0.74937522, -0.6       , ..., -0.37052681,
        -0.11972668, -0.11967907],
       ...,
       [-0.55287818, -0.50310039, -0.6       , ..., -0.76346357,
        -0.45276292, -0.45264096],
       [-0.30655957, -0.49732833, -0.2       , ...,  0.66115218,
        -0.48276887, -0.48295075],
       [-0.22445337, -0.14908034,  0.2       , ...,  0.66115218,
        -0.48276887, -0.48295075]])

In [25]:
cols = [c for c in num_cols if c != 'cost'] # exclude target

std_scaler = StandardScaler()
std_scaled = std_scaler.fit_transform(df_woOut[cols])
std_scaled

array([[ 0.23680601,  0.07233608,  1.09602519, ..., -0.09927367,
         0.07636489,  0.07642966],
       [-0.29394127, -0.01624973, -0.11019699, ..., -0.09927367,
         0.07636489,  0.07642966],
       [-0.82468854, -0.86414255, -1.31641916, ..., -0.09927367,
         0.07636489,  0.07642966],
       ...,
       [-0.29394127, -0.05421508, -1.31641916, ..., -0.9376978 ,
        -0.64546658, -0.64517334],
       [ 0.50217965, -0.03523241, -0.11019699, ...,  2.10205907,
        -0.71050224, -0.71086143],
       [ 0.76755329,  1.11005565,  1.09602519, ...,  2.10205907,
        -0.71050224, -0.71086143]])

### Dummifying categorial variables

In [42]:
#df = pd.DataFrame(std_scaled, columns = [c for c in num_cols if c != 'cost'])
df = pd.DataFrame(MinMax_scaled, columns = [c for c in num_cols if c != 'cost'])

# Concatenate standardized/normalized numerical columns with the others
df = pd.concat([df.reset_index(drop=True), 
                df_woOut[cate_cols + pre_dum_cols].reset_index(drop=True)], axis=1)

In [43]:
print("Number of unique values of 'media_type':", len(set(df['media_type'])))

Number of unique values of 'media_type': 13


**For the purpose of causal inference, the treatment attribute 'media_type' is encoded by label encoding, instead of one-hot encoding.**
- Labeled values are stored in an additional column, 'media_type_labeled'

In [44]:
from sklearn import preprocessing 

label_encoder = preprocessing.LabelEncoder() 
label_encoder.fit_transform(df['media_type'].tolist())

# to decode values 
# label_encoder.inverse_transform([1, 1, 0])

df['media_type_labeled'] = label_encoder.fit_transform(df['media_type'].tolist())

In [46]:
# Dummify categorical variables except for 'media type'
df = pd.get_dummies(df[[c for c in df.columns if c != 'media_type']], drop_first = True)

In [47]:
df # Run regression model on df

Unnamed: 0,store_sales(in millions),store_cost(in millions),unit_sales(in millions),total_children,avg_cars_at home(approx),num_children_at_home,avg_cars_at home(approx).1,SRP,gross_weight,net_weight,...,store_state_BC,store_state_CA,store_state_DF,store_state_Guerrero,store_state_Jalisco,store_state_OR,store_state_Veracruz,store_state_WA,store_state_Yucatan,store_state_Zacatecas
0,-0.388666,-0.464620,0.2,-0.6,-0.5,-0.6,-0.5,-0.229885,0.723270,0.650704,...,0,0,0,0,0,1,0,0,0,0
1,-0.552878,-0.491556,-0.2,-1.0,1.0,-1.0,1.0,-0.229885,0.723270,0.650704,...,0,0,0,0,0,1,0,0,0,0
2,-0.717091,-0.749375,-0.6,0.6,-0.5,-1.0,-0.5,-0.229885,0.723270,0.650704,...,0,0,0,0,0,1,0,0,0,0
3,-0.717091,-0.787856,-0.6,-0.2,0.0,-0.2,0.0,-0.229885,0.723270,0.650704,...,0,0,0,0,0,1,0,0,0,0
4,-0.560018,-0.499420,0.2,0.6,0.0,-1.0,0.0,-0.505747,-0.859119,-0.767887,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57418,-0.799197,-0.757071,-1.0,-0.6,-0.5,-0.6,-0.5,0.298851,0.710692,0.752113,...,0,1,0,0,0,0,0,0,0,0
57419,-0.902722,-0.930401,-1.0,-0.2,0.0,-1.0,0.0,-0.367816,0.433962,0.380282,...,0,1,0,0,0,0,0,0,0,0
57420,-0.552878,-0.503100,-0.6,-0.6,0.5,-1.0,0.5,0.298851,0.710692,0.752113,...,0,1,0,0,0,0,0,0,0,0
57421,-0.306560,-0.497328,-0.2,-0.2,1.0,-1.0,1.0,0.298851,0.710692,0.752113,...,1,0,0,0,0,0,0,0,0,0


In [55]:
df['media_type_labeled']

0         3
1         3
2         3
3         5
4         5
         ..
57418     5
57419     9
57420    11
57421     9
57422     3
Name: media_type_labeled, Length: 57423, dtype: int64