### 1st level. New York City Taxi Trip Duration

- [자료1](https://www.kaggle.com/drgilermo/dynamics-of-new-york-city-animation)
<!-- [자료2](https://www.kaggle.com/aiswaryaramachandran/eda-baseline-model-0-40-rmse), [자료3](https://www.kaggle.com/danijelk/beat-the-benchmark) -->

### prepare

In [None]:
import os, io, base64

# raw data
import pandas as pd
import numpy as np

# for data
from dateutil import parser
from IPython.display import HTML
from imblearn.under_sampling import RandomUnderSampler
from subprocess import check_output

# model
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier

# visualization
import matplotlib.pyplot as plt
from matplotlib import animation, cm

# Show which files ship with the competition dataset.
input_listing = check_output(["ls", "../input/nyc-taxi-trip-duration"])
print(input_listing.decode("utf8"))

In [None]:
# Load the training trips straight from the zipped CSV.
train = pd.read_csv(filepath_or_buffer="../input/nyc-taxi-trip-duration/train.zip")

In [None]:
# Peek at the first rows to check the columns that were loaded.
train.head(n=5)

범위 밖의 값 지우기

In [None]:
# Bounding box used to drop out-of-range coordinates:
# xlim is [min, max] longitude, ylim is [min, max] latitude.
xlim, ylim = [-74.03, -73.77], [40.63, 40.85]

In [None]:
# Keep only trips whose pickup AND dropoff both lie strictly inside the
# bounding box given by xlim (longitude) and ylim (latitude).
inside_box = (
    train.pickup_longitude.gt(xlim[0]) & train.pickup_longitude.lt(xlim[1])
    & train.dropoff_longitude.gt(xlim[0]) & train.dropoff_longitude.lt(xlim[1])
    & train.pickup_latitude.gt(ylim[0]) & train.pickup_latitude.lt(ylim[1])
    & train.dropoff_latitude.gt(ylim[0]) & train.dropoff_latitude.lt(ylim[1])
)
train = train[inside_box]

In [None]:
# Pool pickup and dropoff coordinates into one list per axis
# (all pickups first, then all dropoffs).
longitude = train.pickup_longitude.tolist() + train.dropoff_longitude.tolist()
latitude = train.pickup_latitude.tolist() + train.dropoff_latitude.tolist()

In [None]:
# Scatter every pickup/dropoff point; the tiny, translucent markers let the
# density of points show through.
plt.figure(figsize=(10, 10))
plt.plot(longitude, latitude, marker='.', linestyle='None',
         alpha=0.4, markersize=0.05)
plt.show()

In [None]:
# Two-column frame of all pooled coordinates; this is what gets clustered.
data = pd.DataFrame({"longitude": longitude, "latitude": latitude})

### Cluster

In [None]:
# Partition the pooled coordinates into 15 clusters. `fit` returns the
# estimator itself, so construction and fitting can be chained. The label
# column is added only after fitting, so the model sees just the coordinates.
kmeans = KMeans(n_clusters=15, random_state=2, n_init=10).fit(data)
data["label"] = kmeans.labels_

In [None]:
# Plot a random subset so the scatter stays light. The sample size is capped
# so sampling also works when fewer than 200000 points are available.
loc_df = data.sample(min(200000, len(data)))
plt.figure(figsize=(10, 10))
for label in loc_df.label.unique():
    in_cluster = loc_df.label == label
    # Take one color per cluster from the 20-entry `tab20` map; matplotlib's
    # default color cycle is shorter, so with many clusters colors would
    # repeat and neighbouring clusters could look identical.
    plt.plot(loc_df.longitude[in_cluster], loc_df.latitude[in_cluster],
             '.', alpha=.3, markersize=.3,
             color=cm.tab20(int(label) % cm.tab20.N))
plt.title("Clusters of New York")
plt.show()

색으론 14개까지만 구분되는데?

In [None]:
# Draw the sampled points in gray and overlay each cluster's center (red dot)
# together with its numeric label (blue text).
fig, ax = plt.subplots(figsize=(10, 10))
centers = kmeans.cluster_centers_
for label in loc_df.label.unique():
    members = loc_df[loc_df.label == label]
    center_x, center_y = centers[label]
    ax.plot(members.longitude, members.latitude,
            '.', alpha=.4, markersize=.1, color="gray")
    ax.plot(center_x, center_y, 'o', color='r')
    ax.annotate(label, (center_x, center_y), color='b', fontsize=20)
ax.set_title("Cluster Centers")
plt.show()

14개까지만 본 게 맞았다. 15개까지 나누기엔 너무 많았나? 아니면 샘플링 과정에서 하나가 지워진 건가?<br />
어찌되었든 14개까지만 확인할 수 있었다.

In [None]:
# The KMeans model was fitted on a frame whose columns are named
# "longitude" and "latitude". Rename the trip columns to match before
# predicting, because scikit-learn validates feature names at predict time and
# mismatching names are rejected (or warned about, depending on the version).
pickup_xy = train[["pickup_longitude", "pickup_latitude"]].rename(
    columns={"pickup_longitude": "longitude", "pickup_latitude": "latitude"})
dropoff_xy = train[["dropoff_longitude", "dropoff_latitude"]].rename(
    columns={"dropoff_longitude": "longitude", "dropoff_latitude": "latitude"})
train["pickup_cluster"] = kmeans.predict(pickup_xy)
train["dropoff_cluster"] = kmeans.predict(dropoff_xy)

# Parse the timestamps in one vectorized pass instead of calling the dateutil
# parser once per row; the extracted hour values are the same.
train["pickup_hour"] = pd.to_datetime(train.pickup_datetime).dt.hour

In [None]:
# One row per cluster: center coordinates (x = first fitted feature,
# y = second fitted feature) plus the cluster's integer label.
centers = kmeans.cluster_centers_
clusters = pd.DataFrame({
    'x': centers[:, 0],
    'y': centers[:, 1],
    "label": range(len(centers)),
})

In [None]:
# Thin the plotted sample down to 5000 points for the animation frames.
loc_df = loc_df.sample(n=5000)

### taxi 이동에 대한 animation 만들기

In [None]:
def animate(hour):
    """Redraw the module-level axes ``ax`` with the traffic of one pickup hour.

    The frame shows the sampled points of ``loc_df`` in gray, every cluster
    center in red, and one arrow per (pickup cluster, dropoff cluster) pair
    that has at least one ride in ``train`` during ``hour``. The arrow width
    is proportional to that pair's share of all rides in ``train``.

    Args:
        hour: Pickup hour to draw; compared against ``train.pickup_hour``.
    """
    # Draw on the existing axes only. Opening a new figure per frame would
    # leave one orphaned, empty figure behind for every frame rendered.
    ax.clear()
    ax.set_title("Absolute Traffic - Hour" + str(int(hour)) + ":00")
    # Set once per frame, after clear(), which resets the axes styling.
    ax.set_facecolor('g')

    centers = kmeans.cluster_centers_
    for label in loc_df.label.unique():
        in_cluster = loc_df.label == label
        ax.plot(loc_df.longitude[in_cluster], loc_df.latitude[in_cluster],
                '.', alpha=1, markersize=2, color="gray")
        ax.plot(centers[label, 0], centers[label, 1], 'o', color='r')

    # Count rides per (pickup, dropoff) cluster pair in a single pass instead
    # of re-filtering the whole table once for every pair of clusters.
    rides_this_hour = train[train.pickup_hour == hour]
    ride_counts = rides_this_hour.groupby(
        ["pickup_cluster", "dropoff_cluster"]).size()
    total_rides = len(train)

    center_x = clusters.set_index("label")["x"]
    center_y = clusters.set_index("label")["y"]
    for (src, dst), num_of_rides in ride_counts.items():
        if src == dst:
            # A trip inside one cluster has no direction to draw.
            continue
        pct = num_of_rides / total_rides
        arrow = plt.Arrow(center_x[src], center_y[src],
                          center_x[dst] - center_x[src],
                          center_y[dst] - center_y[src],
                          edgecolor="white", width=15 * pct)
        # A patch only becomes visible once it is attached to the axes.
        ax.add_patch(arrow)

In [None]:
# Distinct pickup hours in ascending order; these are the animation frames
# (expected to be the 24 hours of a day).
sorted_hour = sorted(train["pickup_hour"].unique())

In [None]:
# Create the figure together with its axes and rebind the module-level `ax`
# that `animate` draws on, so that every frame lands on the figure being
# animated rather than on an axes that belongs to an earlier figure.
fig, ax = plt.subplots(figsize=(10, 10))
ani = animation.FuncAnimation(fig, animate, sorted_hour, interval=10000)
plt.close(fig)  # keep the static figure from being shown next to the player

# Render the frames into an inline JavaScript player.
# To export a GIF instead: ani.save("animation.gif", writer="imagemagick", fps=2)
HTML(ani.to_jshtml())

animation... `interval`도 설정해 보고, 프레임을 만들 때마다 저장하는 방식도 시도해 봤는데..<br />
안 되다니...

### Neighborhood Analysis

In [None]:
# Maps a reference longitude to the neighborhood name used for it.
# The names double as row/column labels in the ride-count tables and plots,
# so the spelling here is what ends up on screen.
neighborhood = {
    -74.0019368351: 'Chelsea',
    -73.837549761: 'Queens',
    -73.7854240738: 'JFK',
    -73.9810421975: 'Midtown-North-West',
    -73.9862336241: 'East Village',
    -73.971273324: 'Midtown-North-East',
    -73.9866739677: 'Brooklyn-parkslope',
    -73.8690098118: 'LaGuardia',
    -73.9890572967: 'Midtown',
    -74.0081765545: 'Downtown',
    -73.9213024854: 'Queens-Astoria',
    -73.9470256923: 'Harlem',
    -73.9555565018: 'Upper East Side',
    -73.9453487097: 'Brooklyn-Williamsburg',
    -73.9745967889: 'Upper West Side',
}

In [None]:
# Empty table with one column per neighborhood plus a trailing "name" column
# that lists the same neighborhoods, one per row.
neighborhood_names = list(neighborhood.values())
rides_df = pd.DataFrame(columns=neighborhood_names)
rides_df["name"] = neighborhood_names

In [None]:
# 1-nearest-neighbor lookup from a longitude to the neighborhood whose
# reference longitude is closest. Only longitude is used as a feature.
reference_lons = np.array(list(neighborhood))[:, np.newaxis]
reference_names = list(neighborhood.values())
neigh = KNeighborsClassifier(n_neighbors=1)
neigh.fit(reference_lons, reference_names)

In [None]:
# Label both ends of every trip with a neighborhood. The classifier expects a
# single-feature column, hence the (n, 1) arrays.
pickup_lon = train[["pickup_longitude"]].to_numpy()
dropoff_lon = train[["dropoff_longitude"]].to_numpy()
train["pickup_neighborhood"] = neigh.predict(pickup_lon)
train["dropoff_neighborhood"] = neigh.predict(dropoff_lon)

In [None]:
# Fill the ride-count matrix: the row is the pickup neighborhood and the
# column is the dropoff neighborhood. One cross-tabulation replaces filtering
# the whole trip table once per (pickup, dropoff) pair.
dest_names = list(rides_df.columns[:-1])  # every column except trailing "name"
ride_counts = pd.crosstab(train["pickup_neighborhood"],
                          train["dropoff_neighborhood"])
# Align to the table's row/column order; pairs with no rides become 0.
ride_counts = ride_counts.reindex(index=list(rides_df["name"]),
                                  columns=dest_names, fill_value=0)
rides_df[dest_names] = ride_counts.to_numpy()

In [None]:
# Plotly is imported here, right before the heatmap cells that use it.
import plotly.offline as py
import plotly.graph_objs as go
# Set up plotly's offline mode for notebook output.
# NOTE(review): `connected=True` presumably loads plotly.js from the network
# instead of embedding it in the notebook; confirm against the plotly docs.
py.init_notebook_mode(connected=True)

In [None]:
# Heatmap of ride counts: rows (y) are pickup neighborhoods, columns (x) are
# dropoff neighborhoods. The trailing "name" column holds labels, not counts,
# so it is kept out of `z` and used for the y-axis labels instead.
dest_names = rides_df.columns[:-1]
trace = go.Heatmap(z=rides_df[dest_names].to_numpy(),
                   x=dest_names,
                   y=rides_df["name"])
# NOTE(review): `titlefont` is a legacy layout attribute; newer plotly
# releases may expect title=dict(text=..., font=...). Confirm against the
# installed plotly version.
layout = dict(title="<b>Neighborhoods Interaction</b>",
              titlefont = dict(size=30, color=("rgb(100, 100, 100)")),
              margin=dict(t=100, r=100, b=100, l=150),
              yaxis=dict(title="<b>From</b>"), xaxis=dict(title="<b>To</b>"))

In [None]:
# Render the heatmap with plotly. No matplotlib figure is opened here: plotly
# draws on its own, so a pyplot figure would only show up as an empty plot.
fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig, filename="labelled-heatmap")

In [None]:
# Promote the "name" column to the row index, leaving only count columns.
rides_df = rides_df.set_index("name")

In [None]:
fig, ax = plt.subplots(figsize=(12, 12))

# Row totals and column totals of the ride-count table, computed once
# instead of once per plotted point.
outbound = rides_df.sum(axis=1)
inbound = rides_df.sum(axis=0)
# Pair the totals by position (rows and columns list the neighborhoods in the
# same order). This avoids integer `[i]` lookups on label-indexed Series.
for name, n_out, n_in in zip(rides_df.index, outbound, inbound):
    ax.plot(n_out, n_in, 'o', color='b')
    ax.annotate(name, (n_out, n_in), color='b', fontsize=12)

# Diagonal reference line where inbound equals outbound.
ax.plot([0, 250000], [0, 250000], color='r', linewidth=1)

# Pass a real boolean: the string "off" is truthy and would switch the grid on.
ax.grid(False)
ax.set_xlim([0, 250000])
ax.set_xlabel("Outbound Taxis")
ax.set_ylim([0, 250000])
ax.set_ylabel("Inbound Taxis")
ax.set_title("Inbound and Outbound rides for each cluster")
plt.show()

### Winter vs Summer

In [None]:
# Extract the pickup month with one vectorized datetime conversion instead of
# running the dateutil parser once per row; the month values are the same.
train["pickup_month"] = pd.to_datetime(train.pickup_datetime).dt.month

In [None]:
# Fresh, empty ride-count table for the monthly comparison: one column per
# neighborhood plus a "name" column that is also used as the row index.
neighborhood_names = list(neighborhood.values())
rides_df = pd.DataFrame(columns=neighborhood_names)
rides_df["name"] = neighborhood_names
rides_df = rides_df.set_index("name", drop=False)

In [None]:
def _monthly_rides(month):
    """Return the pickup-by-dropoff ride-count table for one pickup month.

    Rows are pickup neighborhoods and columns are dropoff neighborhoods, both
    in the order of ``neighborhood.values()``; pairs without rides are 0.
    """
    names = list(neighborhood.values())
    in_month = train[train.pickup_month == month]
    counts = pd.crosstab(in_month["pickup_neighborhood"],
                         in_month["dropoff_neighborhood"])
    counts = counts.reindex(index=names, columns=names, fill_value=0)
    return counts.rename_axis(index="name", columns=None)


def _plot_in_out(axis, rides, title):
    """Scatter outbound (row totals) against inbound (column totals) rides."""
    outbound = rides.sum(axis=1)
    inbound = rides.sum(axis=0)
    # Rows and columns share one ordering, so the totals pair up by position.
    for name, n_out, n_in in zip(rides.index, outbound, inbound):
        axis.plot(n_out, n_in, 'o', color='b')
        axis.annotate(name, (n_out, n_in), color='b', fontsize=12)

    axis.plot([0, 40000], [0, 40000])
    # Pass a real boolean: the string "off" is truthy and would switch the
    # grid on.
    axis.grid(False)
    axis.set_xlim([0, 40000])
    axis.set_xlabel("Outbound Taxis")
    axis.set_ylim([0, 40000])
    axis.set_ylabel("Inbound Taxis")
    axis.set_title(title)


fig, ax = plt.subplots(2, figsize=(12, 12))

# The count tables hold numbers only, so the row/column totals never mix in a
# text column. Summer (June) goes on top, winter (January) below.
june_rides = _monthly_rides(6)
_plot_in_out(ax[0], june_rides,
             "Inbound and Outbound rides for each cluster - June")

# As before, `rides_df` ends up holding the January counts indexed by name.
rides_df = _monthly_rides(1)
_plot_in_out(ax[1], rides_df,
             "Inbound and Outbound rides for each cluster - January")

plt.show()

plt.show()