# Process of Data Science
## Assignment 1
## Question 1

In [1]:
import pandas as pd
import numpy as np

## Read the CSV file

In [14]:
# Load the raw shipping records. The file's first column has no header, so
# pandas names it "Unnamed: 0" (visible in the preview below).
csv_path = 'us_shipping.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Origin,Mode,Year,Value,Tons,Total_miles,Mean_miles
0,Alabama,All modes,2012,214750,191500,51227,353
1,Alabama,All modes,2007,182785,268926,58222,396
2,Alabama,Single modes,2012,186001,183784,44765,182
3,Alabama,Single modes,2007,152202,257077,51295,168
4,Alabama,Truck,2012,172409,153300,27111,165


## Convert all numeric values to real (float) values

In [15]:
# Columns that should hold numbers; anything unparseable becomes NaN via
# errors='coerce' so it can be handled in the missing-value step.
numeric_cols = ['Year', 'Value', 'Tons', 'Total_miles', 'Mean_miles']

# pd.to_numeric on its own keeps a fully populated integer column (e.g. Year)
# as int64, so cast explicitly to float to honour "convert to real values".
df[numeric_cols] = (
    df[numeric_cols]
    .apply(pd.to_numeric, errors='coerce')
    .astype(float)
)
df.head()

Unnamed: 0,Origin,Mode,Year,Value,Tons,Total_miles,Mean_miles
0,Alabama,All modes,2012,214750.0,191500.0,51227.0,353.0
1,Alabama,All modes,2007,182785.0,268926.0,58222.0,396.0
2,Alabama,Single modes,2012,186001.0,183784.0,44765.0,182.0
3,Alabama,Single modes,2007,152202.0,257077.0,51295.0,168.0
4,Alabama,Truck,2012,172409.0,153300.0,27111.0,165.0


## Handle missing values
### Replacing NaNs with the mean of each numeric column

In [16]:
numeric_cols = ['Year', 'Value', 'Tons', 'Total_miles', 'Mean_miles']

# Average only the columns being filled. Calling df.mean() on the whole frame
# also touches the string columns (Origin, Mode), which raises a TypeError on
# recent pandas versions instead of silently skipping them.
# NOTE(review): a missing Year is filled with the mean year as well, which is
# unlikely to equal 2007 or 2012 -- confirm that is acceptable for this data.
column_means = df[numeric_cols].mean()
df[numeric_cols] = df[numeric_cols].fillna(column_means)
df.head()

Unnamed: 0,Origin,Mode,Year,Value,Tons,Total_miles,Mean_miles
0,Alabama,All modes,2012,214750.0,191500.0,51227.0,353.0
1,Alabama,All modes,2007,182785.0,268926.0,58222.0,396.0
2,Alabama,Single modes,2012,186001.0,183784.0,44765.0,182.0
3,Alabama,Single modes,2007,152202.0,257077.0,51295.0,168.0
4,Alabama,Truck,2012,172409.0,153300.0,27111.0,165.0


## Creating a 2nd DataFrame 
- Each state as a row
- Each mode as a column
- Each cell is the value of goods that originated from each state by the given mode in 2012

In [17]:
# Build the state-by-mode table of 2012 values in one chain:
#   1. keep only the 2012 rows,
#   2. keep only the columns the pivot needs,
#   3. spread the modes into columns, one row per originating state.
df2 = (
    df.query('Year==2012')
    [['Origin', 'Mode', 'Value']]
    .pivot_table(values='Value', index='Origin', columns='Mode')
)

df2.head()

Mode,Air (incl truck and air),All modes,Deep sea,For-hire truck,Great Lakes,Inland water,Multiple Waterways,Multiple modes,Other modes,Other multiple modes,"Parcel, U.S.P.S. or courier",Pipeline,Private truck,Rail,Rail and water,Single modes,Truck,Truck and rail,Truck and water,Water
Origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Alabama,1458.0,214750.0,0.0,114642.0,,66099.792334,243.0,28749.0,0.0,0.0,15409.0,448.0,57767.0,10663.0,0.0,186001.0,172409.0,12464.0,66099.792334,66099.792334
Alaska,2695.0,19848.0,2909.0,2883.0,,212.0,,1036.0,0.0,0.0,442.0,2054.0,6647.0,1412.0,3.0,18812.0,9530.0,66099.792334,581.0,3121.0
Arizona,8168.0,147147.0,,70781.0,,,,26072.0,0.0,,25894.0,0.0,40530.0,1596.0,,121075.0,111311.0,138.0,66099.792334,
Arkansas,66099.792334,114095.0,,58953.0,,480.0,,8186.0,0.0,0.0,4701.0,0.0,38821.0,6142.0,66099.792334,105909.0,97775.0,2467.0,760.0,480.0
California,78193.0,1476407.0,66099.792334,624281.0,,9376.0,,292281.0,0.0,0.0,271957.0,59176.0,383233.0,19843.0,66099.792334,1184126.0,1007514.0,16157.0,3762.0,19400.0


## Function for computing the proportional value of each mode in df2

In [6]:
def proportional_values(df, state):
    """
    Computes the proportional values of exports
    shipped by each mode for the given state.

    The denominator is the sum of every non-missing cell in the state's row,
    so any aggregate columns present in the frame count towards the total.

    :param df: a pandas DataFrame indexed by state with one column per mode.
    :param state: a string which contains the state (an index label of df).
    :return: a dictionary that contains each mode as well as
    its proportional value for the given state. Modes whose value is
    missing (NaN) or negative are left out.
    :raises KeyError: if the state is not in the index of df.
    """
    modes = df.loc[state]
    # Series.sum() skips NaN by default, so missing modes do not poison the total.
    total = modes.sum()

    results = {}
    for col, value in modes.items():
        # pd.notna is the reliable missing-value test: `value != nan` is always
        # True, and the `pd.np` alias no longer exists in current pandas.
        if pd.notna(value) and value >= 0:
            results[col] = value / total
    return results

## Testing the proportional_values function

In [7]:
import json

# Print each state's breakdown and check that the proportions add up to 1.
# Local names deliberately avoid `map` and `sum`: rebinding those at notebook
# scope would shadow the builtins for every later cell.
for test_number, state in enumerate(['Missouri', 'Alabama'], start=1):
    # Every report after the first is preceded by a blank line.
    separator = '' if test_number == 1 else '\n'
    print('{}Test{}: {}'.format(separator, test_number, state))

    proportions = proportional_values(df2, state)
    print(json.dumps(proportions, indent=2))
    print("Sum: ", sum(proportions.values()))

Test1: Missouri
{
  "Air (incl truck and air)": 0.06391865857662701,
  "All modes": 0.23440525251881825,
  "For-hire truck": 0.11408307648042153,
  "Inland water": 0.0019649488998458716,
  "Multiple Waterways": 3.384508439695153e-05,
  "Multiple modes": 0.04111404152300537,
  "Other modes": 0.0,
  "Other multiple modes": 0.0,
  "Parcel, U.S.P.S. or courier": 0.03674995964062131,
  "Pipeline": 0.0,
  "Private truck": 0.06105556524968921,
  "Rail": 0.008055130086474464,
  "Rail and water": 0.06391865857662701,
  "Single modes": 0.19329121099581287,
  "Truck": 0.17513864173011073,
  "Truck and rail": 0.0030586286270730765,
  "Truck and water": 0.0012135880262335477,
  "Water": 0.001998793984242823
}
Sum:  1.0

Test2: Alabama
{
  "Air (incl truck and air)": 0.0014388597452140257,
  "All modes": 0.21193081638183267,
  "Deep sea": 0.0,
  "For-hire truck": 0.11313700885516209,
  "Inland water": 0.06523205099939038,
  "Multiple Waterways": 0.00023980995753567096,
  "Multiple modes": 0.02837159

### 1) Which modes have the biggest positive and negative differences in tons shipped from 2007 to 2012?



In [8]:
# excluding all modes that are sums of multiple modes.
# this is because I assume the question asks for the
# specific mode, and not a combination of modes.
mode_exclusion_list = [
    'All modes',
    'Multiple modes',
    'Single modes',
    'Other multiple modes',
    'Other modes'
]

df07 = df.loc[df['Year'] == 2007, ['Origin', 'Mode', 'Tons']]
df12 = df.loc[df['Year'] == 2012, ['Origin', 'Mode', 'Tons']]

# Pair the two years on (Origin, Mode) instead of relying on both slices
# having the same row order. validate= raises if a key is duplicated.
tons_change = df07.merge(
    df12,
    on=['Origin', 'Mode'],
    suffixes=('_2007', '_2012'),
    validate='one_to_one',
)
assert len(tons_change) == len(df07) == len(df12), \
    'every (Origin, Mode) pair must be present in both 2007 and 2012'

# Drop the aggregate modes outright, so a placeholder 0 can never win when
# all real changes share the same sign.
tons_change = tons_change[~tons_change['Mode'].isin(mode_exclusion_list)].copy()

# Signed change: positive means more tons in 2012 than in 2007. Using the
# signed value for both questions keeps a large decrease from being reported
# as the biggest "positive" difference (which abs() would allow).
tons_change['Diff'] = tons_change['Tons_2012'] - tons_change['Tons_2007']

biggest_increase = tons_change.loc[tons_change['Diff'].idxmax()]
biggest_decrease = tons_change.loc[tons_change['Diff'].idxmin()]

print('Biggest positive difference in tons-shipped from 2007-2012:')
print("Origin: {}, Mode: {}".format(biggest_increase.Origin, biggest_increase.Mode))

print('\nBiggest negative difference in tons-shipped from 2007-2012:')
print("Origin: {}, Mode: {}".format(biggest_decrease.Origin, biggest_decrease.Mode))

Biggest positive difference in tons-shipped from 2007-2012:
Origin: Texas, Mode: Truck

Biggest negative difference in tons-shipped from 2007-2012:
Origin: California, Mode: Truck


### 2) Which state decreased the most in export value across all modes from 2007 - 2012?

In [13]:
# Keep only the per-state totals ("All modes") for each year.
is_all_modes = df['Mode'] == 'All modes'
df07 = df.loc[(df['Year'] == 2007) & is_all_modes, ['Origin', 'Value']]
df12 = df.loc[(df['Year'] == 2012) & is_all_modes, ['Origin', 'Value']]

# Pair the years by state rather than by row position. validate= raises if a
# state appears more than once in either year.
value_change = df07.merge(
    df12,
    on='Origin',
    suffixes=('_2007', '_2012'),
    validate='one_to_one',
)
assert len(value_change) == len(df07) == len(df12), \
    'every state must have an "All modes" row in both 2007 and 2012'

# Positive numbers mean the export value fell between 2007 and 2012.
value_change['Decrease'] = value_change['Value_2007'] - value_change['Value_2012']

print('The state that decreased the most in export value across all modes from 2007-2012 is:')
if value_change['Decrease'].max() > 0:
    # idxmax returns the first row holding the maximum.
    print(value_change.loc[value_change['Decrease'].idxmax(), 'Origin'])
else:
    # Without this branch the first state would be reported even though
    # nothing actually decreased.
    print('None - no state decreased in export value.')

The state that decreased the most in export value across all modes from 2007-2012 is:
Tennessee
