In [1]:
# Import required libraries
import boto3
import sagemaker

import re
import pandas as pd

from sagemaker import get_execution_role

In [2]:
# Connection
def access_data(bucket_name="twitter-analytics-database", s3client=None):
    """List every object in an S3 bucket and return their ``s3://`` URIs.

    Prints one ``file_name: <key>, size: <bytes>`` line per object while
    listing.

    Args:
        bucket_name: Name of the bucket to list. Defaults to the bucket this
            notebook was written against.
        s3client: Optional S3 client exposing ``get_paginator``. When omitted,
            a default ``boto3.client('s3')`` is created.

    Returns:
        list[str]: One ``s3://<bucket_name>/<key>`` URI per object, in listing
        order. Empty when the bucket holds no objects.
    """
    if s3client is None:
        s3client = boto3.client('s3')

    files = []
    # Paginate: a single list call returns at most 1000 keys.
    pages = s3client.get_paginator('list_objects_v2').paginate(Bucket=bucket_name)
    for page in pages:
        # 'Contents' is absent from the response when there is nothing to list.
        for file in page.get('Contents', []):
            print(f"file_name: {file['Key']}, size: {file['Size']}")
            # Build the URI from bucket_name so the two can never drift apart.
            files.append(f"s3://{bucket_name}/{file['Key']}")
    return files

In [3]:
# Tweets data
def get_tweets(data_files):
    """Load tweet CSV files and return the de-duplicated ``tweet`` column.

    Args:
        data_files: Iterable of paths/URIs readable by ``pd.read_csv``. Each
            file must provide at least the ``id`` and ``tweet`` columns.

    Returns:
        pandas.Series: The ``tweet`` column of the combined data, keeping the
        first row for every distinct ``id``. The index is that of the combined
        frame, so it has gaps where duplicates were dropped. An empty Series
        named ``tweet`` is returned when ``data_files`` is empty.
    """
    paths = list(data_files)
    if not paths:
        # pd.concat raises ValueError on an empty sequence; return an empty result instead.
        return pd.Series(name='tweet', dtype=object)

    combined = pd.concat((pd.read_csv(path) for path in paths), ignore_index=True)  # Combining the CSV files
    unique_rows = combined.drop_duplicates(subset='id', keep='first')  # Dropping duplicates
    return unique_rows['tweet']

In [4]:
# Tweet Preprocessing
def preprocess_tweet(tweet):
    """Strip ``@mentions`` and http(s) URLs from a tweet string.

    A whitespace character directly before a mention is kept; the mention
    itself is dropped. URLs are removed up to the next whitespace.

    Args:
        tweet: The tweet text.

    Returns:
        str: The text with mentions and URLs removed.
    """
    # Removing mentions: @name
    without_mentions = re.sub(r'(\s)?@\w+', r'\1', tweet)
    # Removing URLs
    return re.sub(r'https?://\S+', '', without_mentions)

In [5]:
# List the CSV objects in the bucket, then load, de-duplicate and clean the tweets.
data_files = access_data()
tweets_text = get_tweets(data_files).apply(preprocess_tweet)

file_name: nytimes.csv, size: 19577
file_name: nytimes2022-08-26 20:44:19.381548.csv, size: 19760
file_name: nytimes2022-08-26 20:45:44.462263.csv, size: 20833
file_name: nytimes2022-08-26 20:48:37.282261.csv, size: 19780
file_name: nytimes2022-08-26 20:48:41.401947.csv, size: 19739
file_name: nytimes2022-08-26 21:14:19.243230.csv, size: 20413


In [6]:
# Show the cleaned tweets as this cell's output.
tweets_text

0       Many, if not all, of whom have gone through a...
1                                                       
2         A sort of mutiny The Constelicit ion Incident 
3      Now lets talk about how many people are there ...
4                                               RT :    
                             ...                        
595     It’s time he’s arrested and to hell with the ...
596    RT : Lazy, smug, elitist and totally wrong: NY...
597    RT : "The scandal has echoes of the dark old d...
598               The middle class will not pay for this
599    RT : In the affidavit for the search of Donald...
Name: tweet, Length: 463, dtype: object