In [2]:
# Add user specific python libraries to path
import sys
sys.path.insert(0, "/home/smehra/local-packages")

In [3]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import time
from datetime import timedelta  
from datetime import date
from datetime import datetime


In [4]:
import os
os.environ["SPARK_CONF_DIR"] = "/data/tmp/spark/conf"

In [5]:
import pyspark
import random

from pyspark.sql import SparkSession
from pyspark.sql import HiveContext
from pyspark.sql import functions as F

config = pyspark.SparkConf().setAll([('spark.ui.port', 4050), 
                                     ('spark.ui.enabled', True),
                                     
                                     # if running in local mode, driver will be only executor
                                     # hence, give driver as much memory as possible if running in local mode
                                     ('spark.driver.memory','50g'), 
                                     
                                     # set up executor config if running in cluster or client mode
                                     #('spark.executor.instances', '5'), 
                                     #('spark.executor.cores', '5'), 
                                     #('spark.executor.memory', '5g'), 
                                     #('spark.executor.memoryOverhead', '500m'),
                                     
                                     # more partitions means smaller partition size per task
                                     # hence, would reduce memory load
                                     ('spark.sql.shuffle.partitions', '1000'),
                                     
                                     # increase max result size if you are "collecting" big dataset 
                                     # driver will need more memory to collect
                                     ('spark.driver.maxResultSize', '2g'),
                                     
                                     # set location spark should use for temporary data
                                     ('spark.local.dir', '/data/tmp/smehra/tmp'),
                                     # Set location of hive database
                                     ('spark.sql.warehouse.dir', '/data/tmp/hive_warehouse'),
                                     # Add mysql connector jar to use mysql as metastore service
                                     ('spark.jars', '/data/tmp/spark/jars/mysql-connector-java-5.1.30-bin.jar'),
                                    
                                     # KryoSerializer is faster and more compact than the Java default serializer.
                                     ('spark.serializer', 'org.apache.spark.serializer.KryoSerializer'),
                                     ('spark.kryoserializer.buffer.max.mb', '2000'),
                                     
                                     # G1GC overcomes the latency and throughput limitations with the old garbage collectors.
                                     ('spark.executor.extraJavaOptions','-XX:+UseG1GC')])

spark = SparkSession.builder \
        .enableHiveSupport() \
        .config(conf=config) \
        .master("local[30]") \
        .appName("afgh_project_smehra_hive_setup") \
        .getOrCreate()

# Get the Hive Context
hive = HiveContext(spark.sparkContext)

spark.sparkContext._conf.getAll()


[(u'spark.driver.memory', u'50g'),
 (u'spark.repl.local.jars',
  u'file:///data/tmp/spark/jars/mysql-connector-java-5.1.30-bin.jar'),
 (u'spark.sql.shuffle.partitions', u'1000'),
 (u'spark.app.id', u'local-1585092384719'),
 (u'spark.jars', u'/data/tmp/spark/jars/mysql-connector-java-5.1.30-bin.jar'),
 (u'spark.app.name', u'afgh_project_smehra_hive_setup'),
 (u'spark.master', u'local[30]'),
 (u'spark.executor.extraJavaOptions', u'-XX:+UseG1GC'),
 (u'spark.executor.id', u'driver'),
 (u'spark.driver.port', u'38473'),
 (u'spark.local.dir', u'/data/tmp/smehra/tmp'),
 (u'spark.serializer', u'org.apache.spark.serializer.KryoSerializer'),
 (u'spark.ui.port', u'4050'),
 (u'spark.kryoserializer.buffer.max.mb', u'2000'),
 (u'spark.sql.warehouse.dir', u'/data/tmp/hive_warehouse'),
 (u'spark.sql.catalogImplementation', u'hive'),
 (u'spark.rdd.compress', u'True'),
 (u'spark.serializer.objectStreamReset', u'100'),
 (u'spark.driver.maxResultSize', u'2g'),
 (u'spark.submit.deployMode', u'client'),
 (u'

##  Number of users per district per day

### Detected Segments based user counts

In [6]:
user_segments_per_district_per_day = pd.read_csv('/data/tmp/smehra/aggregated_data/migration_detector_output_data/district_day_metrics.csv')
user_segments_per_district_per_day.drop(columns = ['percentage_migrated'], inplace = True)
user_segments_per_district_per_day.columns = ['district_id', 'day_series', 'user_count']
user_segments_per_district_per_day.head()


Unnamed: 0,district_id,day_series,user_count
0,101,1,86936.0
1,101,2,100962.0
2,101,3,107369.0
3,101,4,113819.0
4,101,5,119551.0


In [7]:
import random

random_districts = random.sample(user_segments_per_district_per_day.district_id.unique().tolist(), 10)
random_districts = sorted([402, 1201, 816, 2902, 3203, 1801, 708, 614, 3202, 401])
random_districts

[401, 402, 614, 708, 816, 1201, 1801, 2902, 3202, 3203]

In [None]:
fig, ax = plt.subplots()

for district in random_districts:
    
    district_data = user_segments_per_district_per_day[(user_segments_per_district_per_day.district_id == district)]
    ax.plot(district_data.day_series, district_data.user_count)
    
#ax.set_xticks([1,200,400,600,800,1000,1200])
    


### Daily Modal based user counts

In [42]:
user_daily_modal_districts = hive.sql('select * from afghanistan.user_daily_modal_districts_wide')

In [43]:
district_ids = [101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 201, 202, 203, 204, 205, 206, 207, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 401, 402, 403, 404, 405, 406, 407, 408, 409, 501, 502, 503, 504, 505, 506, 507, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 901, 902, 903, 904, 905, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, 1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1118, 1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1201, 1202, 1203, 1204, 1205, 1206, 1207, 1208, 1209, 1210, 1211, 1212, 1213, 1214, 1215, 1216, 1217, 1301, 1302, 1303, 1304, 1305, 1306, 1307, 1308, 1309, 1310, 1311, 1312, 1313, 1314, 1315, 1401, 1402, 1403, 1404, 1405, 1406, 1407, 1501, 1502, 1503, 1504, 1505, 1506, 1507, 1601, 1602, 1603, 1604, 1605, 1606, 1607, 1608, 1609, 1610, 1611, 1612, 1613, 1614, 1615, 1701, 1702, 1703, 1704, 1705, 1706, 1707, 1708, 1709, 1710, 1711, 1801, 1802, 1803, 1804, 1805, 1806, 1807, 1808, 1809, 1810, 1811, 1812, 1813, 1814, 1901, 1902, 1903, 1904, 1905, 1906, 1907, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2101, 2102, 2103, 2104, 2105, 2106, 2107, 2108, 2109, 2110, 2111, 2201, 2202, 2203, 2204, 2205, 2301, 2302, 2303, 2304, 2305, 2306, 2307, 2308, 2309, 2310, 2311, 2312, 2313, 2401, 2402, 2403, 2404, 2405, 2406, 2407, 2408, 2409, 2410, 2411, 2412, 2413, 2414, 2415, 2416, 2501, 2502, 2503, 2504, 2505, 2506, 2507, 2508, 2509, 2510, 2511, 2601, 2603, 2604, 2604, 2605, 2701, 2702, 2703, 2704, 2705, 2706, 2707, 2708, 2709, 2710, 2801, 2802, 2803, 2804, 2805, 2806, 2807, 2901, 2902, 2903, 2904, 2905, 2906, 2907, 2908, 2909, 2910, 2911, 2912, 2913, 2914, 2915, 2916, 2917, 2918, 2919, 3001, 3002, 3003, 3004, 3005, 3006, 3007, 3008, 3101, 3102, 3103, 3104, 3105, 3106, 3107, 3201, 3202, 3203, 3204, 3205, 3206, 3207, 3208, 3209, 3210, 3211, 3212, 3213, 3301, 3302, 3303, 3304, 3305, 3306, 3307, 3401, 3402, 3403, 3404, 3405, 3406, 3407, 3408, 3409]

# create an empty dataframe where we will store per district per day counts
per_district_per_date_counts = pd.DataFrame(columns = ['district_id']) 
per_district_per_date_counts['district_id'] = district_ids
per_district_per_date_counts.head()


Unnamed: 0,district_id
0,101
1,102
2,103
3,104
4,105


In [None]:
# iterate through each day series
for daySeries in np.arange(1, 1462, 1).tolist():
    
    print('getting data for day series ' + str(daySeries))
    
    # get district counts for selected daySeries
    counts_for_selected_day = user_daily_modal_districts.groupBy(str(daySeries)).count().toPandas()
    counts_for_selected_day.columns = ['district_id', str(daySeries)]
    counts_for_selected_day.dropna(inplace = True)
    counts_for_selected_day.district_id = counts_for_selected_day.district_id.astype(int)

    # merge counts with results dataset
    per_district_per_date_counts = per_district_per_date_counts.merge(counts_for_selected_day, on = 'district_id', how = 'left')

per_district_per_date_counts.head()


In [45]:
user_count_per_district_per_day = per_district_per_date_counts.copy()

user_count_per_district_per_day.index = per_district_per_date_counts.district_id
user_count_per_district_per_day.drop(columns = ['district_id'], inplace = True)

user_count_per_district_per_day = user_count_per_district_per_day.stack()
user_count_per_district_per_day = user_count_per_district_per_day.reset_index()
user_count_per_district_per_day.columns = ['district_id', 'day_series', 'user_count']
user_count_per_district_per_day.day_series = user_count_per_district_per_day.day_series.astype(int)

user_count_per_district_per_day.sort_values(by=['district_id', 'day_series'], inplace = True)
user_count_per_district_per_day.head()


Unnamed: 0,district_id,day_series,user_count
0,101,1,331060.0
1,101,2,332718.0
2,101,3,332701.0
3,101,4,332953.0
4,101,5,306451.0


In [46]:
import random

random_districts = random.sample(user_count_per_district_per_day.district_id.unique().tolist(), 10)
random_districts = sorted([402, 1201, 816, 2902, 3203, 1801, 708, 614, 3202, 401])
random_districts

[401, 402, 614, 708, 816, 1201, 1801, 2902, 3202, 3203]

In [None]:
fig, ax = plt.subplots()

for district in random_districts:
    
    district_data = user_count_per_district_per_day[(user_count_per_district_per_day.district_id == district)]
    ax.plot(district_data.day_series, district_data.user_count)
    
#ax.set_xticks([1,200,400,600,800,1000,1200])
    
