In [None]:
import pandas as pd
import datetime as dt

# Load the transactions, parsing the transaction date column as datetimes.
data = pd.read_csv("rfm_xmas19.txt", parse_dates=["trans_date"])
group_by_customer = data.groupby("customer_id")

# One row per customer holding the date of their most recent transaction.
last_transaction = group_by_customer["trans_date"].max()
best_churn = last_transaction.to_frame()

# A customer whose last transaction is strictly before this day is flagged 1.
cutoff_day = dt.datetime(2019, 10, 16)
best_churn["churned"] = (best_churn["trans_date"] < cutoff_day).astype(int)

In [None]:
# Number of transactions per customer.
best_churn["nr_of_transactions"] = group_by_customer.size()
# Total spend per customer. Only "tran_amount" is summed so the result is a
# Series; summing the whole group would also try to aggregate the date column
# and would hand a DataFrame to a single-column assignment.
best_churn["amount_spent"] = group_by_customer["tran_amount"].sum()
# The raw last-transaction date is not used by the scoring that follows.
best_churn.drop("trans_date", axis="columns", inplace=True)

In [None]:
# Min-max scale the transaction count: (value - min) / (max - min).
tran_counts = best_churn["nr_of_transactions"]
tran_low, tran_high = tran_counts.min(), tran_counts.max()
best_churn["scaled_tran"] = (tran_counts - tran_low) / (tran_high - tran_low)

# Same scaling for the amount spent.
amounts = best_churn["amount_spent"]
amount_low, amount_high = amounts.min(), amounts.max()
best_churn["scaled_amount"] = (amounts - amount_low) / (amount_high - amount_low)

# Equal-weight blend of both scaled measures, expressed on a 0-100 scale.
best_churn["score"] = 100 * (
    0.5 * best_churn["scaled_tran"] + 0.5 * best_churn["scaled_amount"]
)

# Highest scores first.
best_churn.sort_values(by="score", ascending=False, inplace=True)

In [None]:
# Coupon value: 30% of the mean transaction amount.
coupon = 0.3 * data["tran_amount"].mean()
# Number of such coupons that fit into 1000 (presumably the campaign budget —
# TODO confirm).
nr_of_customers = 1000 / coupon

In [None]:
# Keep the churned customers only and take the first 50 rows in the frame's
# current order.
is_churned = best_churn["churned"].eq(1)
top_50_churned = best_churn[is_churned].head(50)

# Persist the selection (the index is written as the first column).
top_50_churned.to_csv("best_customers.txt")

In [None]:
# Load the Play Store listing and report its (rows, columns) shape.
playstore = pd.read_csv("googleplaystore.csv")
print(playstore.shape)
answer = "no"  # We don't care about free apps for this project

# Remove the row labelled 10472.
# NOTE(review): presumably a malformed record — confirm against the raw file.
playstore.drop(index=10472, inplace=True)

In [None]:
# Independent copy of the rows whose price is not 0.
# NOTE(review): this comparison only filters as intended if "Price" is already
# numeric at this point — confirm the column is cleaned before this cell.
paid = playstore.loc[playstore["Price"] != 0].copy()
def clean_size(size):
    """Convert a file-size string to a float expressed in megabytes.

    Parameters
    ----------
    size : str
        Either a number followed by "M" (megabytes), a number followed by "k"
        (kilobytes), a bare number, or the literal "Varies with device".

    Returns
    -------
    float
        The size in megabytes (kilobyte values are divided by 1000), or NaN
        for "Varies with device".

    Raises
    ------
    ValueError
        If the remaining text cannot be parsed as a float.
    """
    if size == "Varies with device":
        # Plain float NaN: the `pd.np` alias is gone from current pandas.
        return float("nan")

    size = size.replace("M", "")
    if size.endswith("k"):
        return float(size[:-1]) / 1000
    return float(size)
# The "Type" column is dropped from the paid subset.
paid.drop(columns="Type", inplace=True)
# Review counts become integers.
paid["Reviews"] = paid["Reviews"].astype(int)
# Size strings are converted to megabytes via `clean_size`.
size_in_mb = paid["Size"].map(clean_size)
paid["Size"] = size_in_mb.astype(float)
paid.info()

In [None]:
# Sort by review count (descending) so that, for each app name, the first
# occurrence kept by `drop_duplicates` is the most-reviewed row.
paid = (
    paid
    .sort_values(by="Reviews", ascending=False)
    .drop_duplicates(subset="App")
)
# Sanity check: number of duplicated app names still present.
print(paid.duplicated(subset="App").sum())
paid = paid.reset_index(drop=True)

In [None]:
# Display the price column of the de-duplicated paid apps.
paid.loc[:, "Price"]

In [None]:
# Work on an independent copy of the apps priced below 50.
affordable_apps = paid[paid["Price"] < 50].copy()

# Two price segments: below 5 ("cheap") and 5 or more ("reasonable").
cheap = affordable_apps["Price"].lt(5)
reasonable = affordable_apps["Price"].ge(5)

# One price histogram per segment.
affordable_apps[cheap].hist(column="Price", grid=False, figsize=(12, 6))
affordable_apps[reasonable].hist(column="Price", grid=False, figsize=(12, 6))

# Label each app with the name of its segment.
affordable_apps["affordability"] = cheap.map(
    {True: "cheap", False: "reasonable"}
)

In [None]:
# Segment masks and the mean price of the cheap segment.
cheap = affordable_apps["Price"].lt(5)
reasonable = affordable_apps["Price"].ge(5)
cheap_mean = affordable_apps.loc[cheap, "Price"].mean()

# Cheap apps: 1 when priced below the cheap-segment mean, else 0. The
# right-hand side covers every row; `.loc` keeps only the cheap ones.
below_cheap_mean = affordable_apps["Price"].lt(cheap_mean).astype(int)
affordable_apps.loc[cheap, "price_criterion"] = below_cheap_mean

# Price against rating for the reasonable segment.
affordable_apps[reasonable].plot.scatter(x="Price", y="Rating")

reasonable_mean = affordable_apps.loc[reasonable, "Price"].mean()

# Reasonable apps: the same rule against their own segment mean.
below_reasonable_mean = affordable_apps["Price"].lt(reasonable_mean).astype(int)
affordable_apps.loc[reasonable, "price_criterion"] = below_reasonable_mean

In [None]:
# Genres are ";"-separated, so the genre count is the separator count plus one.
affordable_apps["genre_count"] = affordable_apps["Genres"].str.count(";") + 1

# Mean price per (affordability, genre_count) segment. "Price" is selected
# before aggregating so the mean is never attempted on the text columns; the
# result is still a one-column DataFrame named "Price".
genres_mean = affordable_apps.groupby(
    ["affordability", "genre_count"]
)[["Price"]].mean()


def label_genres(row):
    """Flag whether an app is priced below the mean of its genre segment.

    The segment is the (affordability, genre_count) pair of the row, looked up
    in the module-level `genres_mean` table.

    Parameters
    ----------
    row : mapping
        Must provide "affordability", "genre_count" and "Price".

    Returns
    -------
    int
        1 if the price is strictly below the segment mean, otherwise 0.

    Raises
    ------
    KeyError
        If the segment is not present in `genres_mean`.
    """
    aff = row["affordability"]
    gc = row["genre_count"]
    price = row["Price"]

    # Look the mean up by column label rather than by position.
    segment_mean = genres_mean.loc[(aff, gc), "Price"]
    return 1 if price < segment_mean else 0

# 1/0 flag per app: priced below the mean of its (affordability, genre_count)
# segment.
affordable_apps["genre_criterion"] = affordable_apps.apply(
    label_genres, axis="columns"
)

# Mean price per (affordability, Category) segment. "Price" is selected before
# aggregating so the mean is never attempted on the text columns; the result
# is still a one-column DataFrame named "Price".
categories_mean = affordable_apps.groupby(
    ["affordability", "Category"]
)[["Price"]].mean()

def label_categories(row):
    """Flag whether an app is priced below the mean of its category segment.

    The segment is the (affordability, Category) pair of the row, looked up in
    the module-level `categories_mean` table.

    Parameters
    ----------
    row : mapping
        Must provide "affordability", "Category" and "Price".

    Returns
    -------
    int
        1 if the price is strictly below the segment mean, otherwise 0.

    Raises
    ------
    KeyError
        If the segment is not present in `categories_mean`.
    """
    aff = row["affordability"]
    cat = row["Category"]
    price = row["Price"]

    # Look the mean up by column label rather than by position.
    segment_mean = categories_mean.loc[(aff, cat), "Price"]
    return 1 if price < segment_mean else 0

# 1/0 flag per app, produced row by row with `label_categories`.
category_flags = affordable_apps.apply(label_categories, axis=1)
affordable_apps["category_criterion"] = category_flags

In [None]:
criteria = ["price_criterion", "genre_criterion", "category_criterion"]
# Row-wise mode over the three criterion columns.
criteria_votes = affordable_apps.loc[:, criteria]
affordable_apps["Result"] = criteria_votes.mode(axis="columns")
def new_price(row):
    """Return the proposed price for one app, rounded to two decimals.

    The proposal is the larger of the app's current price and the mean price
    of its affordability segment (`cheap_mean` for "cheap" rows,
    `reasonable_mean` for every other row); both means are module-level names.
    """
    is_cheap = row["affordability"] == "cheap"
    segment_mean = cheap_mean if is_cheap else reasonable_mean
    return round(max(row["Price"], segment_mean), 2)
    
# Proposed price per app (see `new_price`).
affordable_apps["New Price"] = affordable_apps.apply(new_price, axis="columns")

# Strip "+" and "," from the install counts before casting to int.
# `regex=True` is required for "[+,]" to be read as a character class; without
# it the pattern is matched as literal text and the cast below fails.
affordable_apps["Installs"] = (
    affordable_apps["Installs"]
    .str.replace("[+,]", "", regex=True)
    .astype(int)
)

# Price increase multiplied by the install count.
affordable_apps["Impact"] = (
    affordable_apps["New Price"] - affordable_apps["Price"]
) * affordable_apps["Installs"]

total_impact = affordable_apps["Impact"].sum()
print(total_impact)

In [None]:
def categorize(score):
    """Map a survey score to its NPS category.

    Scores 0-6 give "Detractor", 7-8 give "Passive" and 9-10 give "Promoter";
    any other value gives None.
    """
    bands = (
        ("Detractor", range(0, 7)),
        ("Passive", (7, 8)),
        ("Promoter", (9, 10)),
    )
    for label, scores in bands:
        if score in scores:
            return label
    return None

In [None]:
# Load the survey responses, parsing the event date as a datetime.
df = pd.read_csv("nps.csv", parse_dates=["event_date"])

# Integer month key of the form YYYYMM.
year = df["event_date"].dt.year
month = df["event_date"].dt.month
df["yearmonth"] = year * 100 + month

# NPS category of each response (see `categorize`).
df["category"] = df["score"].map(categorize)

# Responses per month and category, plus the monthly total.
nps = df.pivot_table(index="yearmonth", columns="category", aggfunc="size")
nps["total_responses"] = nps.sum(axis="columns")

# Promoter share minus detractor share, as a percentage cast to int.
net_share = (nps["Promoter"] - nps["Detractor"]) / nps["total_responses"]
nps["nps"] = (100 * net_share).astype(int)

In [None]:
# Load the subscriptions with both date columns parsed as datetimes.
subs = pd.read_csv("muscle_labs.csv", parse_dates=["end_date", "start_date"])

# YYYYMM key of the month in which each subscription ended.
end_dates = subs["end_date"]
subs["churn_month"] = end_dates.dt.year * 100 + end_dates.dt.month

# Number of subscriptions ending in each month, indexed by that key.
monthly_churn = subs.groupby("churn_month").size().to_frame("total_churned")

In [None]:
years = list(range(2011, 2015))
months = list(range(1, 13))
# Every YYYYMM key from 201101 onwards; the final key (201412) is left out.
yearmonths = [100 * y + m for y in years for m in months][:-1]

# Attach the churn counts to the full month list; months without any churn
# have no match in `monthly_churn` and are filled with 0.
churn = pd.DataFrame({"yearmonth": yearmonths}).merge(
    monthly_churn, how="left", left_on="yearmonth", right_index=True
)
churn = churn.fillna(0)
churn["total_churned"] = churn["total_churned"].astype(int)

In [None]:
import datetime as dt
import matplotlib.pyplot as plt

# arange = __import__("numpy").arange
# Ellipse = __import__("matplotlib").patches.Ellipse
# ax = churn.plot(x="yearmonth", y="churn_rate", figsize=(12,6), rot=45, marker=".")
# start, end = ax.get_xlim()
# ax.get_xticks()
# ax.set_xticks(arange(2, end, 3))
# ax.set_xticklabels(yearmonths[2::3])
# circle = Ellipse((35, churn.loc[churn.yearmonth == "201312", "churn_rate"].iloc[0]),
#                  5, 0.065, color='sandybrown', fill=False
#                    )
# ax.add_artist(circle)
# ax.xaxis.label.set_visible(False)
# ax.spines['top'].set_visible(False)
# ax.spines['right'].set_visible(False)
# ax.get_legend().remove()

def get_customers(yearmonth):
    """Count the subscriptions active on the first day of a month.

    `yearmonth` is an integer of the form YYYYMM. A row of the module-level
    `subs` frame is counted when its start date is strictly before the first
    of that month and its end date is on or after it.
    """
    year, month = divmod(yearmonth, 100)
    month_start = dt.datetime(year, month, 1)

    started_before = subs["start_date"] < month_start
    not_yet_ended = subs["end_date"] >= month_start
    return (started_before & not_yet_ended).sum()

# Regular imports instead of attribute lookups on `__import__(...)`; the
# `matplotlib.patches` lookup only worked when that submodule had already been
# loaded by something else.
from numpy import arange
from matplotlib.patches import Ellipse

# Customers active at the start of each month and the resulting churn rate.
churn["total_customers"] = churn["yearmonth"].apply(get_customers)
churn["churn_rate"] = churn["total_churned"] / churn["total_customers"]
# Month keys are converted to strings before plotting.
churn["yearmonth"] = churn["yearmonth"].astype(str)

ax = churn.plot(x="yearmonth", y="churn_rate", figsize=(12,6), rot=45, marker=".")

# Label every third month, starting at position 2. The positions are derived
# from the number of months rather than from the axis limit, so there is
# always exactly one tick per label.
ax.set_xticks(arange(2, len(yearmonths), 3))
ax.set_xticklabels(yearmonths[2::3])

# Ring around the December 2013 value; 35 is the position of 201312 in the
# month list, which starts at 201101.
circle = Ellipse((35, churn.loc[churn.yearmonth == "201312", "churn_rate"].iloc[0]),
                 5, 0.065, color='sandybrown', fill=False
                   )
ax.add_artist(circle)

# Declutter: no x-axis title, no top/right border, no legend.
ax.xaxis.label.set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.get_legend().remove()

In [None]:
# We import everything that we'll use

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Load the 2019 questions, parsing the creation timestamp, then summarise the
# columns.
questions = pd.read_csv(
    "2019_questions.csv",
    parse_dates=["CreationDate"],
)

questions.info()

In [None]:
# Distinct Python types found among the values of the "Tags" column.
questions["Tags"].apply(type).unique()

In [None]:

# Missing favourite counts become 0, then the column is cast to int.
questions["FavoriteCount"] = questions["FavoriteCount"].fillna(0).astype(int)
questions.dtypes

In [None]:
# Turn "<a><b><c>" into ["a", "b", "c"]: strip the leading "<" and trailing
# ">", then split on "><". `regex=True` is required for the anchored pattern
# to be treated as a regular expression instead of literal text.
questions["Tags"] = (
    questions["Tags"]
    .str.replace("^<|>$", "", regex=True)
    .str.split("><")
)
questions.sample(3)

In [None]:
# Number of questions each tag appears on.
tag_count = {}

for tags in questions["Tags"]:
    for tag in tags:
        tag_count[tag] = tag_count.get(tag, 0) + 1

In [None]:

# One row per tag (tags as the index) with a single "Count" column.
tag_count = (
    pd.DataFrame.from_dict(tag_count, orient="index")
    .rename(columns={0: "Count"})
)
tag_count.head(10)

In [None]:
# The 20 tags with the highest counts, in ascending order of count.
most_used = tag_count.sort_values("Count").iloc[-20:]
most_used

In [None]:
# Horizontal bar chart of the most used tags.
most_used.plot.barh(figsize=(16, 8))

In [None]:
# Total views of the questions carrying each tag.
tag_view_count = {}

for row_label, question in questions.iterrows():
    views = question["ViewCount"]
    for tag in question["Tags"]:
        tag_view_count[tag] = tag_view_count.get(tag, 0) + views

# One row per tag with a single "ViewCount" column.
tag_view_count = (
    pd.DataFrame.from_dict(tag_view_count, orient="index")
    .rename(columns={0: "ViewCount"})
)

# The 20 tags with the most views, in ascending order of views.
most_viewed = tag_view_count.sort_values("ViewCount").iloc[-20:]

most_viewed.plot.barh(figsize=(16, 8))

In [None]:
# Most used (left) and most viewed (right) tags side by side.
fig, axes = plt.subplots(1, 2)
fig.set_size_inches((24, 10))
most_used.plot.barh(ax=axes[0], subplots=True)
most_viewed.plot.barh(ax=axes[1], subplots=True)

In [None]:
# Every distinct tag, in the order of the `tag_count` index.
all_tags = tag_count.index.tolist()

In [None]:
# Empty tag-by-tag table; preview its top-left 4x4 corner.
associations = pd.DataFrame(columns=all_tags, index=all_tags)
associations.iloc[:4, :4]

In [None]:
# Start every cell at zero with an explicit integer dtype, instead of relying
# on `fillna` to downcast the empty table's object columns.
associations = associations.fillna(0).astype(int)

# Every question adds 1 to each (tag, tag) cell formed from its own tags,
# including the diagonal.
for tags in questions["Tags"]:
    associations.loc[tags, tags] += 1

In [None]:
# Restrict the co-occurrence table to the most used tags on both axes.
top_tags = most_used.index
relations_most_used = associations.loc[top_tags, top_tags]

def style_cells(x):
    """Build the CSS table used to style the co-occurrence frame.

    Returns a frame with the same index and columns as `x` where the
    ("time-series", "r") and ("r", "time-series") cells get a yellow
    background, the diagonal cells get blue text and every other cell is an
    empty string.
    """
    styles = pd.DataFrame('', index=x.index, columns=x.columns)

    for row_label, col_label in (("time-series", "r"), ("r", "time-series")):
        styles.loc[row_label, col_label] = "background-color: yellow"

    # The diagonal is written last, after the highlighted pair.
    for position in range(len(styles)):
        styles.iloc[position, position] = "color: blue"

    return styles

# Render the table with the whole-frame styles from `style_cells`
# (axis=None passes the entire frame to the function at once).
styler = relations_most_used.style
styler.apply(style_cells, axis=None)

In [None]:

# Blank out the diagonal (each tag paired with itself). The table is cast to
# float first so that NaN is a valid value for every column, and a plain
# float NaN is used because the `pd.np` alias is gone from current pandas.
relations_most_used = relations_most_used.astype(float)
for i in range(relations_most_used.shape[0]):
    relations_most_used.iloc[i, i] = float("nan")

In [None]:

# Heatmap of tag co-occurrence among the most used tags, without annotations.
heatmap_figure = plt.figure(figsize=(12, 8))
sns.heatmap(data=relations_most_used, cmap="Greens", annot=False)

In [None]:
# Load the complete question set, parsing the creation timestamp.
all_q = pd.read_csv(
    "all_questions.csv",
    parse_dates=["CreationDate"],
)

In [None]:
# Turn "<a><b><c>" into ["a", "b", "c"]: strip the leading "<" and trailing
# ">", then split on "><". `regex=True` is required for the anchored pattern
# to be treated as a regular expression instead of literal text.
all_q["Tags"] = (
    all_q["Tags"]
    .str.replace("^<|>$", "", regex=True)
    .str.split("><")
)

In [None]:
def class_deep_learning(tags):
    """Return 1 if any of `tags` is on the deep-learning list, otherwise 0."""
    deep_learning_tags = (
        "lstm", "cnn", "scikit-learn", "tensorflow",
        "keras", "neural-network", "deep-learning",
    )
    return int(any(tag in deep_learning_tags for tag in tags))

In [None]:
# 1/0 flag per question: does it carry at least one deep-learning tag?
deep_learning_flags = all_q["Tags"].map(class_deep_learning)
all_q["DeepLearning"] = deep_learning_flags

In [None]:
# Five random rows as a spot check.
all_q.sample(n=5)

In [None]:
# Keep questions created before 2020. The explicit copy makes the result an
# independent frame, so adding columns to it later does not operate on a
# slice of the original.
all_q = all_q[all_q["CreationDate"].dt.year < 2020].copy()

In [None]:
def fetch_quarter(datetime):
    """Return the quarter label of a date as "<yy>Q<q>", e.g. "19Q3".

    `datetime` must expose `.year` and `.month`; the label combines the last
    two characters of the year with the quarter number (1-4).
    """
    two_digit_year = str(datetime.year)[-2:]
    quarter_number = (datetime.month - 1) // 3 + 1
    return two_digit_year + "Q" + str(quarter_number)

# Quarter label ("<yy>Q<q>") of each question's creation date.
quarter_labels = all_q["CreationDate"].map(fetch_quarter)
all_q["Quarter"] = quarter_labels

In [None]:
# First five rows, now including the "Quarter" column.
all_q.head(5)

In [None]:
# Per quarter: number of deep-learning questions (sum of the 1/0 flag), total
# number of questions (group size) and the ratio of the two.
# The index is reset to help with visualizations later.
quarterly = (
    all_q.groupby("Quarter")
    .agg({"DeepLearning": ["sum", "size"]})
    .set_axis(["DeepLearningQuestions", "TotalQuestions"], axis="columns")
    .assign(
        DeepLearningRate=lambda q: q["DeepLearningQuestions"] / q["TotalQuestions"]
    )
    .reset_index()
)
quarterly.sample(5)

In [None]:
# Line: deep-learning rate per quarter.
ax1 = quarterly.plot(x="Quarter", y="DeepLearningRate",
                    kind="line", linestyle="-", marker="o", color="orange",
                    figsize=(24,12)
                    )

# Bars: total questions per quarter, drawn on a secondary y-axis of the same
# plot.
ax2 = quarterly.plot(x="Quarter", y="TotalQuestions",
                     kind="bar", ax=ax1, secondary_y=True, alpha=0.7, rot=45)

# Write each total above its bar. `Series.items()` is used because
# `iteritems()` no longer exists in current pandas.
for idx, t in quarterly["TotalQuestions"].items():
    ax2.text(idx, t, str(t), ha="center", va="bottom")

# Replace the automatic legend with one combining the entries of both axes.
ax1.get_legend().remove()

handles1, labels1 = ax1.get_legend_handles_labels()
handles2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(handles=handles1 + handles2,
           labels=labels1 + labels2,
           loc="upper left", prop={"size": 12})

# Declutter both axes: hide the top/right borders and the right-hand ticks.
for ax in (ax1, ax2):
    for where in ("top", "right"):
        ax.spines[where].set_visible(False)
    ax.tick_params(right=False, labelright=False)