# Building an ETL Pipeline

As the second part of the predict for Gather, you will need to build a pipeline of functions in python which does the following:

1. Function to connect to twitter and scrapes "Eskom_SA" tweets.
<br>
<br>
2. Cleans/Processes the tweets from the scraped tweets which will create a dataframe with two new columns using the following functions: <br>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; a) Hashtag Remover from Analyse Functions
<br>
<br>
3. Functions which connects to your SQL database and uploads the tweets into the table you store the tweets in the database.

In [13]:
# General:
import tweepy           # To consume Twitter's API
import pandas as pd     # To handle data
import numpy as np      # For numerical computation
import json
# For plotting and visualization:
from IPython.display import display
import pyodbc

# Consumer and Access details

Fill in your Consumer and Access details you should have recieved when applying for a Twitter API. 

In [14]:
# Consumer:
CONSUMER_KEY    = ''
CONSUMER_SECRET = ''

# Access:
ACCESS_TOKEN  = ''
ACCESS_SECRET = ''

In [15]:
# API's setup:
def twitter_setup():
    """
    Utility function to setup the Twitter's API
    with access and consumer keys from Twitter.
    """

    # Authentication and access using keys:
    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)

    # Return API with authentication:
    api = tweepy.API(auth, timeout=1000)
    return api

In [16]:
api = twitter_setup()
user = api.get_user("Eskom_SA")

print("User details:")
print(user.name)
print(user.description)
print(user.location)

print("Last 20 Followers:")
for follower in user.followers():
    print(follower.name)

User details:
Eskom Hld SOC Ltd

Gauteng, South African   
Last 20 Followers:
Khanyile Jaji
SabiSebe
Tony
Rele
Jomo Tukakgomo
TBOY THUSO
Fundiswa
Blabla
Shaun Roberts
Sabelo Virgo
Mr Kulula
Gosiame
Andries Marakalala
Carol Simango
Rofhiwa
MCLIRAH
Kylo Kotze
Lebo
azwifaneli phanuel ndou
Skeptic Blacksheep


In [17]:
for tweet in api.search(q="@Eskom_SA", lang="en", rpp=1):
    print(tweet)

Status(_api=<tweepy.api.API object at 0x0000024EDE9F72C8>, _json={'created_at': 'Thu Mar 05 09:00:16 +0000 2020', 'id': 1235490300438687744, 'id_str': '1235490300438687744', 'text': 'RT @Eskom_SA: #Eskom #MediaStatement\n\nEskom to institute liquidation proceedings against Trillian as Supreme Court of Appeal dismisses Tril…', 'truncated': False, 'entities': {'hashtags': [{'text': 'Eskom', 'indices': [14, 20]}, {'text': 'MediaStatement', 'indices': [21, 36]}], 'symbols': [], 'user_mentions': [{'screen_name': 'Eskom_SA', 'name': 'Eskom Hld SOC Ltd', 'id': 466420346, 'id_str': '466420346', 'indices': [3, 12]}], 'urls': []}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 854716993878077440, 'id_str': '8547169938780

In [18]:
api = twitter_setup()
tweets = []
dates = []
result = pd.DataFrame()
for tweet in api.search(q="@Eskom_SA -filter:retweets", lang="en", rpp=100, count=40):
    tweets = tweets + [f"{tweet.text}"]
    dates = dates + [f"{tweet.created_at}"]
result['Tweets'] = tweets
result['Date'] = dates
result

Unnamed: 0,Tweets,Date
0,@SmilinGeorge_SA @KeemetsweB @Eskom_SA @ewnupd...,2020-03-05 09:00:23
1,@KeemetsweB @Eskom_SA @ewnupdates @SABCNewsOnl...,2020-03-05 08:59:46
2,@Ngwatlevin_12 @Masaleml @tito_mboweni @EFFSou...,2020-03-05 08:59:20
3,@PhotoColman @James19799836 @AfrikPinocchio @C...,2020-03-05 08:53:19
4,@Em_es_gee @Lindros12 @Eskom_SA That’s good an...,2020-03-05 08:53:04
5,@samkelemaseko @Eskom_SA Malema is busy with r...,2020-03-05 08:52:11
6,"@sir_mehlo @Lindros12 @Eskom_SA Hai kabi, it c...",2020-03-05 08:51:42
7,@gqakhwe @samkelemaseko @SABCNewsOnline @Eskom...,2020-03-05 08:51:38
8,@NazierPaulsen @Our_DA @DDMabuza @Eskom_SA My ...,2020-03-05 08:49:17
9,@OUTASA @Eskom_SA Is he in attendance?,2020-03-05 08:47:39


In [19]:
result['Tweets'][0]

'@SmilinGeorge_SA @KeemetsweB @Eskom_SA @ewnupdates @SABCNewsOnline @News24 @IOL @eNCA @Moneyweb @Fin24 @BBGAfrica… https://t.co/XJnsGDMc0S'

# Function 1:

Write a function which:
- Scrapes _"Eskom_SA"_ tweets from Twitter. 

Function Specifications:
- The function should return a dataframe with the scraped tweets with just the "_Tweets_" and "_Date_". 
- Will take in the ```consumer key,  consumer secret code, access token``` and ```access secret code```.

NOTE:
The dataframe should have the same column names as those in your SQL Database table where you store the tweets.

In [None]:
def twitter_df(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_SECRET ):

    # Code Here
    
    return None

# Function 2: Removing hashtags and the municipalities

Write a function which:
- Uses the function you wrote in the Analyse section to extract the hashtags and municipalities into it's own column in a new data frame. 

Function Specifications:
- The function should take in the pandas dataframe you created in Function 1 and return a new pandas dataframe. 

In [None]:
twitter_df(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_SECRET )

In [20]:
def extract_municipality_hashtags(df):
    
    """Returns a modified dataframe with two new columns appended, "municipality" and "hashtags". Information is extracted from
    twitter data that includes the municipality and the list of hashtags referred to in each tweet, respectively.
    Input must contain a column named "Tweets".
    
    Parameters
    ----------
    df: dataframe
    
    Returns
    -------
    df_new: modified dataframe 
    """
    
    mun_dict = {'@CityofCTAlerts' : 'Cape Town',
                '@CityPowerJhb' : 'Johannesburg',
                '@eThekwiniM' : 'eThekwini' ,
                '@EMMInfo' : 'Ekurhuleni',
                '@centlecutility' : 'Mangaung',
                '@NMBmunicipality' : 'Nelson Mandela Bay',
                '@CityTshwane' : 'Tshwane'}
    
    if type(df) == type(pd.DataFrame()):
        municipality = []
        data = df
        for i in data["Tweets"]:
            data_str = i.replace(":", "") # Remove ":" from the end of municipality keys and hashtags
            data_str = str.split(data_str) # Splits a sentence/multi-word string by white space into a list
            data_muni = [a for a in data_str if a[0] == "@"] # Add words containing the hashtag to new list
            municipality = municipality + [data_muni]
        for j in range(len(municipality)):
            municipality[j] = [i.replace(i, mun_dict[i]) for i in municipality[j] if i in mun_dict]
        for x in range(len(municipality)):
            if municipality[x] == []:
                municipality[x] = (np.nan)

        df_muni = pd.DataFrame({"municipality": municipality})
        df = df.join(df_muni)
    
        data_subset = df
        hashtags = []
        for j, k in data_subset.iterrows(): # Iterate over pd df
            data_subset_str = data_subset.iloc[j,0]
            data_subset_str = str.split(data_subset_str) # Splits a sentence/multi-word string by white space into a list
            data_subset_hashtags = [a for a in data_subset_str if a[0] == "#"] # Add words containing the hashtag to new list
            data_subset_hashtags = list(map(lambda b: str.lower(b), data_subset_hashtags)) # Convert all hashtags in list to lower case
            if data_subset_hashtags == []:
                data_subset_hashtags = (np.nan) # Use () instead of [], resulting nan must not have square brackets in solution
            hashtags = hashtags + [data_subset_hashtags]

        df = data_subset
        df2 = pd.DataFrame({"hashtags": hashtags})
        df = df.join(df2)
        df_new = df
    
    else:
        print("Error: input must be a data frame.")
    return df_new

In [21]:
twitter_df = extract_municipality_hashtags(result)

In [45]:
twitter_df.head()

Unnamed: 0,Tweets,Date,municipality,hashtags
0,@SmilinGeorge_SA @KeemetsweB @Eskom_SA @ewnupd...,2020-03-05 09:00:23,,
1,@KeemetsweB @Eskom_SA @ewnupdates @SABCNewsOnl...,2020-03-05 08:59:46,,
2,@Ngwatlevin_12 @Masaleml @tito_mboweni @EFFSou...,2020-03-05 08:59:20,,
3,@PhotoColman @James19799836 @AfrikPinocchio @C...,2020-03-05 08:53:19,,
4,@Em_es_gee @Lindros12 @Eskom_SA That’s good an...,2020-03-05 08:53:04,,


In [46]:
twitter_df = twitter_df[['Tweets', 'Date']]

In [47]:
twitter_df.head()

Unnamed: 0,Tweets,Date
0,@SmilinGeorge_SA @KeemetsweB @Eskom_SA @ewnupd...,2020-03-05 09:00:23
1,@KeemetsweB @Eskom_SA @ewnupdates @SABCNewsOnl...,2020-03-05 08:59:46
2,@Ngwatlevin_12 @Masaleml @tito_mboweni @EFFSou...,2020-03-05 08:59:20
3,@PhotoColman @James19799836 @AfrikPinocchio @C...,2020-03-05 08:53:19
4,@Em_es_gee @Lindros12 @Eskom_SA That’s good an...,2020-03-05 08:53:04


# Function 3: Updating SQL Database with pyODBC

Write a function which:
- Connects and updates your SQL database. 

Function Specifications:
- The function should take in a pandas dataframe created in Function 2. 
- Connect to your SQL database.
- Update the table you store your tweets in.
- Not return any output.

In [48]:
conn = pyodbc.connect(driver='{SQL Server}',
                      host='EDSA-PGLBGKO\SQLEXPRESS',
                      database='gather_eskom',
                      UID='sa',
                      PWD='edsa@2020')

In [49]:
twitter_df['Tweets'][0]

'@SmilinGeorge_SA @KeemetsweB @Eskom_SA @ewnupdates @SABCNewsOnline @News24 @IOL @eNCA @Moneyweb @Fin24 @BBGAfrica… https://t.co/XJnsGDMc0S'

In [52]:
cursor = conn.cursor()
conn.close()

In [None]:
cursor.execute(
    """
    CREATE TABLE twts (
        tweet VARCHAR(300),
        date DATETIME
    )
    """
)

In [42]:
def pyodbc_twitter(connection, df, twitter_table):
    """Extracts a dataframe of tweets and connects and updates the data in your local SQL database

    Parameters:
    -----------
    connection: SQL connection settings

    df: DataFrame of tweets with their timestamp

    twitter_table: An already existing twitter SQL database 

    Examples:
    --------
    >>> conn = pyodbc.connect(driver='{SQL Server}',
                      host='your_server_name',
                      database='your_database_name', 
                      trusted_connection='tcon',
                      UID='your_user_name',
                      autocommit=True)
    
    >>> df = 
    """
    cursor = connection.cursor()
    
    for i in range(len(df.index)):
        tweet_text = df['Tweets'][i]
        tweet_date = df['Date'][i]
        tweet_municipality = df['municipality'][i]
        tweet_hashtag = df['hashtags'][i]
        
        try:
            cursor.execute(
                f"""
                INSERT INTO {twitter_table}
                VALUES ('{tweet_text}', {tweet_date}, '{tweet_municipality}', '{tweet_hashtag}')
                """
            )
        
        except ProgrammingError:
            pass

    return None 

In [43]:
pyodbc_twitter(conn, twitter_df, tweets)

NameError: name 'ProgrammingError' is not defined