In [1]:
# modules we'll use
import pandas as pd
import numpy as np

# for Box-Cox Transformation
from scipy import stats

# for min_max scaling
from mlxtend.preprocessing import minmax_scaling

# plotting modules
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the Kickstarter projects CSV into a DataFrame.
# NOTE(review): 'C:../../' is a Windows drive-relative path, so the file that
# gets opened depends on the current directory of drive C: — confirm this is
# intended before sharing the notebook.
kickstarters_2017 = pd.read_csv(
    filepath_or_buffer='C:../../Dataset/ks-projects-201801.csv',
)

In [3]:
# Seed NumPy's global random generator so any random draws are reproducible.
np.random.seed(seed=0)

In [4]:
# Wrap the `usd_goal_real` column (selected via attribute access) in a
# one-column DataFrame; this frame is the input to the min-max scaling below.
original_data = pd.DataFrame(kickstarters_2017.usd_goal_real)

In [5]:
# Same construction as `original_data`, but selecting the column with bracket
# indexing instead of attribute access.
original_data2 = pd.DataFrame(kickstarters_2017['usd_goal_real'])

In [6]:
# Show the frame built via attribute access (the rendered table is truncated).
original_data

Unnamed: 0,usd_goal_real
0,1533.95
1,30000.00
2,45000.00
3,5000.00
4,19500.00
...,...
378656,50000.00
378657,1500.00
378658,15000.00
378659,15000.00


In [7]:
# Show the frame built via bracket indexing, for comparison with `original_data`.
original_data2

Unnamed: 0,usd_goal_real
0,1533.95
1,30000.00
2,45000.00
3,5000.00
4,19500.00
...,...
378656,50000.00
378657,1500.00
378658,15000.00
378659,15000.00


In [8]:
# Third variant: start from an empty DataFrame; the column is attached in the
# following cell.
original_data3 = pd.DataFrame()

In [9]:
# Attach the goal column to the (so far empty) frame. This mutates
# `original_data3` in place.
original_data3['usd_goal_real']=kickstarters_2017['usd_goal_real']

In [10]:
# Show the frame built by column assignment, for comparison with the two
# constructor-based variants.
original_data3

Unnamed: 0,usd_goal_real
0,1533.95
1,30000.00
2,45000.00
3,5000.00
4,19500.00
...,...
378656,50000.00
378657,1500.00
378658,15000.00
378659,15000.00


In [11]:
# Min-max scale the goal column; the result is bound to a new name so the
# unscaled frame stays available for comparison.
scaled_data = minmax_scaling(
    array=original_data,
    columns=['usd_goal_real'],
)

In [12]:
# Compare the goal column before and after min-max scaling.
#
# `original_data` and `scaled_data` are one-column DataFrames, so calling
# `.min()` / `.max()` on the frame returns a length-1 Series. Passing such a
# Series to float() is deprecated in pandas >= 2.1 and is slated to raise a
# TypeError, so the column is selected first and the reductions return plain
# scalars. The printed text is unchanged.
print('Original data\nPreview:\n', original_data.head())
print('Minimum value:', float(original_data['usd_goal_real'].min()),
      '\nMaximum value:', float(original_data['usd_goal_real'].max()))
print('_'*30)

print('\nScaled data\nPreview:\n', scaled_data.head())
print('Minimum value:', float(scaled_data['usd_goal_real'].min()),
      '\nMaximum value:', float(scaled_data['usd_goal_real'].max()))

Original data
Preview:
    usd_goal_real
0        1533.95
1       30000.00
2       45000.00
3        5000.00
4       19500.00
Minimum value: 0.01 
Maximum value: 166361390.71
______________________________

Scaled data
Preview:
    usd_goal_real
0       0.000009
1       0.000180
2       0.000270
3       0.000030
4       0.000117
Minimum value: 0.0 
Maximum value: 1.0


In [13]:
# Re-display the unscaled frame after scaling; the previewed rows match the
# earlier display of `original_data`.
original_data

Unnamed: 0,usd_goal_real
0,1533.95
1,30000.00
2,45000.00
3,5000.00
4,19500.00
...,...
378656,50000.00
378657,1500.00
378658,15000.00
378659,15000.00


In [14]:
# One-column frame holding the `goal` column, used as input for scaling.
original_goal_data = pd.DataFrame(kickstarters_2017['goal'])

In [15]:
# Min-max scale the `goal` column into a new frame.
scaled_goal_data = minmax_scaling(
    array=original_goal_data,
    columns=['goal'],
)

In [16]:
# Inspect the scaled `goal` values.
scaled_goal_data

Unnamed: 0,goal
0,0.000010
1,0.000300
2,0.000450
3,0.000050
4,0.000195
...,...
378656,0.000500
378657,0.000015
378658,0.000150
378659,0.000150


In [17]:
# Inspect the unscaled `goal` frame for comparison with the scaled values.
original_goal_data

Unnamed: 0,goal
0,1000.0
1,30000.0
2,45000.0
3,5000.0
4,19500.0
...,...
378656,50000.0
378657,1500.0
378658,15000.0
378659,15000.0


In [18]:
# Boolean mask: True for rows whose `usd_pledged_real` is strictly positive.
index_of_positive_pledges = kickstarters_2017['usd_pledged_real'].gt(0)

In [19]:
# Inspect the boolean mask (True where `usd_pledged_real` > 0).
index_of_positive_pledges

0         False
1          True
2          True
3          True
4          True
          ...  
378656     True
378657     True
378658     True
378659     True
378660     True
Name: usd_pledged_real, Length: 378661, dtype: bool

In [20]:
# Keep only the strictly positive pledges; the original row labels are retained.
positive_pledges = kickstarters_2017.loc[
    index_of_positive_pledges, 'usd_pledged_real'
]

In [21]:
# Inspect the pledges that passed the positivity mask.
positive_pledges

1          2421.0
2           220.0
3             1.0
4          1283.0
5         52375.0
           ...   
378656       25.0
378657      155.0
378658       20.0
378659      200.0
378660      524.0
Name: usd_pledged_real, Length: 326134, dtype: float64

In [22]:
# Box-Cox transform the positive pledges.
# `stats.boxcox` returns a tuple whose first element is the transformed data;
# it is re-wrapped in a Series that reuses the labels of `positive_pledges`.
normalized_pledges = pd.Series(
    data=stats.boxcox(positive_pledges)[0],
    index=positive_pledges.index,
    name='usd_pledged_real',
)

In [23]:
# Side-by-side summary of the pledges before and after the Box-Cox transform:
# a five-row preview plus the minimum and maximum of each Series.
print('Original data\nPreview:\n', positive_pledges.head())
print('Minimum value:', float(positive_pledges.min()),
      '\nMaximum value:', float(positive_pledges.max()))
# Visual separator between the two summaries.
print('_'*30)

print('\nNormalized data\nPreview:\n', normalized_pledges.head())
print('Minimum value:', float(normalized_pledges.min()),
      '\nMaximum value:', float(normalized_pledges.max()))

Original data
Preview:
 1     2421.0
2      220.0
3        1.0
4     1283.0
5    52375.0
Name: usd_pledged_real, dtype: float64
Minimum value: 0.45 
Maximum value: 20338986.27
______________________________

Normalized data
Preview:
 1    10.165142
2     6.468598
3     0.000000
4     9.129277
5    15.836853
Name: usd_pledged_real, dtype: float64
Minimum value: -0.7779954122762203 
Maximum value: 30.69054020451361


In [24]:
# Repeat the Box-Cox normalisation, this time on the `pledged` column.
# NOTE(review): the resulting Series is labelled 'pledged_real' although it is
# derived from `pledged`, and `index_of_postive_pledged` is misspelled. Both
# are kept as-is because later cells rely on these names.
index_of_postive_pledged = kickstarters_2017['pledged'].gt(0)
tt = kickstarters_2017.loc[index_of_postive_pledged, 'pledged']

# Element 0 of the boxcox result is the transformed data.
normalized_tt = pd.Series(
    data=stats.boxcox(tt)[0],
    index=tt.index,
    name='pledged_real',
)

# Summary of the values before the transform ...
print('Original data\nPreview:\n', tt.head())
print(
    'Minimum value:', float(tt.min()),
    '\nMaximum value:', float(tt.max()),
)
print('_' * 30)

# ... and after it.
print('\nNormalized data\nPreview:\n', normalized_tt.head())
print(
    'Minimum value:', float(normalized_tt.min()),
    '\nMaximum value:', float(normalized_tt.max()),
)

Original data
Preview:
 1     2421.0
2      220.0
3        1.0
4     1283.0
5    52375.0
Name: pledged, dtype: float64
Minimum value: 1.0 
Maximum value: 20338986.27
______________________________

Normalized data
Preview:
 1    10.013887
2     6.403367
3     0.000000
4     9.005193
5    15.499596
Name: pledged_real, dtype: float64
Minimum value: 0.0 
Maximum value: 29.63030787418848


In [25]:
# Inspect the Box-Cox-transformed `pledged` values.
normalized_tt

1         10.013887
2          6.403367
3          0.000000
4          9.005193
5         15.499596
            ...    
378656     3.562025
378657     5.919593
378658     3.291544
378659     6.270664
378660     7.648440
Name: pledged_real, Length: 326134, dtype: float64

In [26]:
# Left-join the normalised pledges onto the goal frame by row label; rows that
# have no entry in `normalized_tt` get a missing value in `pledged_real`.
new_dataset = original_data.join(normalized_tt, how='left')

In [27]:
# Inspect the joined frame: goal values alongside the normalised pledges.
new_dataset

Unnamed: 0,usd_goal_real,pledged_real
0,1533.95,
1,30000.00,10.013887
2,45000.00,6.403367
3,5000.00,0.000000
4,19500.00,9.005193
...,...,...
378656,50000.00,3.562025
378657,1500.00,5.919593
378658,15000.00,3.291544
378659,15000.00,6.270664
