# Importing the necessary packages

In [None]:
# Import the necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

# Import the data

In [None]:
# Load the two csv files created with the data preparation jupyter notebook.
# Both files are parsed with the same settings, so the options are declared
# once and passed to each read_csv call.
CsvReadOptions = dict(sep=',', low_memory=False, lineterminator='\n')
Rotterdam = pd.read_csv('2015.csv', **CsvReadOptions)
RotterdamFlickr = pd.read_csv('FlickrTotal.csv', **CsvReadOptions)

In [None]:
# Show the dtype pandas assigned to every column of the Rotterdam dataframe.
Rotterdam.dtypes

# Descriptive statistics

In [None]:
# Get the number of unique users.
Rotterdam['user_id'].nunique()

In [None]:
# Average number of tweets per user
len(Rotterdam.index) / Rotterdam['user_id'].nunique()

In [None]:
Rotterdam.shape

In [None]:
# Distribution of tweets per user.
TweetsPerUser = Rotterdam.groupby('user_id').count()

In [None]:
# Boxplot of the distribution --> lots of outliers
%matplotlib inline
sns.boxplot(x=TweetsPerUser['item_number'])

In [None]:
# Dropping outliers
TweetsPerUserNoOutliers = TweetsPerUser[TweetsPerUser['item_number'] < 20]

In [None]:
# Dropping values with only 1 tweets --> those are not suited for my analysis.
TweetsPerUserNoOutliers = TweetsPerUserNoOutliers[TweetsPerUserNoOutliers['item_number'] > 2]

In [None]:
# Boxplot of the distribution without outliers
%matplotlib inline
sns.boxplot(x=TweetsPerUserNoOutliers['item_number'])

In [None]:
Rotterdam.dtypes

In [None]:
# Convert the time to datetime objects.
Rotterdam['tweet_date'] = pd.to_datetime(Rotterdam['created_at_x'])

In [None]:
# Create two new columns, one for tweet time the other for month.
Rotterdam['tweet_hour'] = Rotterdam['tweet_date'].dt.hour
Rotterdam['tweet_month'] = Rotterdam['tweet_date'].dt.month

In [None]:
# Get tweets per month
TweetsPerMonth = Rotterdam.groupby('tweet_month', as_index=False).count()

In [None]:
TweetsPerMonth

In [None]:
# Plot the number of tweets per month --> enormous increase.
sns.set(style='darkgrid')
sns.lineplot(x='tweet_month', y='item_number', data=TweetsPerMonth)

In [None]:
# Get tweet distribution per day
TweetsPerHour = Rotterdam.groupby('tweet_hour', as_index=False).count()

In [None]:
# Plot the distribution of tweets per day
sns.set(style='darkgrid')
sns.lineplot(x='tweet_hour', y='item_number', data=TweetsPerHour)

# Identifying tourists (Twitter)

In [None]:
# Get all the unique users and put them in a list
UniqueUsers = Rotterdam['user_id'].unique()

In [None]:
# Create an empty dictionary to store the tourist identification information.
TouristInfoDict = dict()

In [None]:
# For each user, identify whether the user is a tourist based on the time difference between the first and last tweet
# of that user. Update the dictionary accordingly.
#
# The tweet count and the first/last tweet date are computed for all users in
# one grouped pass. Filtering the complete dataframe once per user rescans
# every row for every user, which gets very slow for many users.
# min()/max() skip missing dates; size() counts every row of the user.
TweetDatesPerUser = Rotterdam.groupby('user_id')['tweet_date']
numberOfTweets = TweetDatesPerUser.size()
firstTweet = TweetDatesPerUser.min()
lastTweet = TweetDatesPerUser.max()

# Get the difference in days between the first and last tweet. The delta is
# first - last, so it is zero or negative, and `.days` rounds towards minus
# infinity: after abs() every started day counts as a full day (a span of
# 10 days and 1 hour gives 11).
daysDelta = (firstTweet - lastTweet).dt.days

# A user is a tourist when there are more than 1 tweets and the absolute day
# difference is below 11.
isTourist = (numberOfTweets > 1) & (daysDelta.abs() < 11)

# Add every user to the dictionary. Ids that do not form a group (groupby
# leaves out a missing user id) are stored as False.
TouristInfoDict.update(isTourist.reindex(UniqueUsers, fill_value=False).to_dict())

In [None]:
# Transform the dictionary into a dataframe and rename the columns.
TouristInfo = pd.DataFrame(list(TouristInfoDict.items()))
TouristInfo = TouristInfo.rename(columns={0: "user_id", 1: "is_tourist"})

In [None]:
# Statistics on the amount of tourists identified.
TouristInfo['is_tourist'].value_counts()

In [None]:
# Merge with the original dataset
Rotterdam_TouristInfo = pd.merge(Rotterdam, TouristInfo, on='user_id', how='left')

In [None]:
# Dataset with locals
Rotterdam_Locals = Rotterdam_TouristInfo[Rotterdam_TouristInfo['is_tourist'] == False]

In [None]:
# Dataset with tourists
Rotterdam_Tourists = Rotterdam_TouristInfo[Rotterdam_TouristInfo['is_tourist'] == True]

In [None]:
# Export to CSV files for later analysis. to_csv() does not create missing
# directories, so make sure the 'CSV' output folder exists before writing.
import os

os.makedirs('CSV', exist_ok=True)
Rotterdam_Locals.to_csv('CSV/2015_Locals.csv', index=False)
Rotterdam_Tourists.to_csv('CSV/2015_Tourists.csv', index=False)

# Identifying tourists (Flickr)

In [None]:
# Get an idea of the data: show the first rows of the Flickr dataframe.
RotterdamFlickr.head()

In [None]:
# Show the column dtypes; postedDate needs to be converted to datetime.
RotterdamFlickr.dtypes

In [None]:
# Create new datetime column
RotterdamFlickr['postedTime'] = RotterdamFlickr['postedDate'].map(lambda x: datetime.datetime.fromtimestamp(x))

In [None]:
RotterdamFlickr.dtypes

In [None]:
# Get all the unique users and put them in a list
UniqueUsersFlickr = RotterdamFlickr['userID'].unique()

In [None]:
# Create an empty dictionary to store the tourist identification information.
TouristInfoDictFlickr = dict()

In [None]:
# For each user, identify whether the user is a tourist based on the time difference between the first and last post
# of that user. Update the dictionary accordingly.
#
# The post count and the first/last post time are computed for all users in
# one grouped pass. Filtering the complete dataframe once per user rescans
# every row for every user, which gets very slow for many users.
# min()/max() skip missing times; size() counts every row of the user.
PostTimesPerUser = RotterdamFlickr.groupby('userID')['postedTime']
numberOfPosts = PostTimesPerUser.size()
firstPost = PostTimesPerUser.min()
lastPost = PostTimesPerUser.max()

# Get the difference in days between the first and last post. The delta is
# first - last, so it is zero or negative, and `.days` rounds towards minus
# infinity: after abs() every started day counts as a full day (a span of
# 10 days and 1 hour gives 11).
daysDelta = (firstPost - lastPost).dt.days

# A user is a tourist when there are more than 1 posts and the absolute day
# difference is below 11.
isTourist = (numberOfPosts > 1) & (daysDelta.abs() < 11)

# Add every user to the dictionary. Ids that do not form a group (groupby
# leaves out a missing user id) are stored as False.
TouristInfoDictFlickr.update(isTourist.reindex(UniqueUsersFlickr, fill_value=False).to_dict())

In [None]:
# Transform the dictionary into a dataframe and rename the columns.
TouristInfoFlickr = pd.DataFrame(list(TouristInfoDictFlickr.items()))
TouristInfoFlickr = TouristInfoFlickr.rename(columns={0: "userID", 1: "is_tourist"})

In [None]:
# Statistics on the amount of tourists identified.
TouristInfoFlickr['is_tourist'].value_counts()

In [None]:
# Merge with the original dataset
Rotterdam_TouristInfoFlickr = pd.merge(RotterdamFlickr, TouristInfoFlickr, on='userID', how='left')

In [None]:
# Dataset with locals
Rotterdam_LocalsFlickr = Rotterdam_TouristInfoFlickr[Rotterdam_TouristInfoFlickr['is_tourist'] == False]

In [None]:
# Dataset with tourists
Rotterdam_TouristsFlickr = Rotterdam_TouristInfoFlickr[Rotterdam_TouristInfoFlickr['is_tourist'] == True]

In [None]:
Rotterdam_LocalsFlickr['userID'].nunique()

In [None]:
# Write both Flickr subsets to CSV files (without the index) for later analysis.
for FlickrSubset, CsvName in (
    (Rotterdam_LocalsFlickr, 'Flickr_Locals.csv'),
    (Rotterdam_TouristsFlickr, 'Flickr_Tourists.csv'),
):
    FlickrSubset.to_csv(CsvName, index=False)