# Download libraries

In [1]:
!pip install pytz
!pip install tweepy
!pip install statsmodels
!pip install plotly
!pip install wordcloud
!pip install textblob
!pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


# Importing libraries

In [2]:
# Standard library
import csv
import datetime
import json
import logging
import os
import re
import sys
import time
# NOTE: must stay after `import datetime` so that `datetime` names the class
# (the notebook calls `datetime.now()`).
from datetime import datetime, timedelta

# Third-party
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
import statsmodels.api as sm
import tweepy
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from pandas.errors import EmptyDataError
from pytz import timezone
from textblob import TextBlob
from wordcloud import WordCloud

# Loading the credentials

In [3]:
# Load the Twitter API credentials from the config file.
# config.json must provide the four keys read below.
with open('config.json', 'r') as f:
    config = json.load(f)

consumer_key = config['consumer_key']
consumer_secret = config['consumer_secret']
access_token = config['access_token']
access_token_secret = config['access_token_secret']

# Build the authenticated client. `wait_on_rate_limit=True` asks tweepy to
# sleep instead of failing when a rate limit is hit.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)

# Verify the Twitter API credentials.
# tweepy v4 (the version providing `api.search_tweets`, used further down)
# exposes its base error as `tweepy.TweepyException`; the former
# `tweepy.error.TweepError` no longer exists there.
try:
    user = api.verify_credentials()
    print("Twitter API connection successful.")
except tweepy.TweepyException as e:
    print("Error: Failed to verify Twitter API credentials.")
    print(e)

Twitter API connection successful.


# Disabling warnings

In [4]:
import warnings
warnings.filterwarnings("ignore")

# Downloading and saving Twitter data
The free tier of the Twitter API has the following limitations:
- **14-day tweet history limit**
- **Approximately 1,500 tweet requests per 900 seconds**

In [None]:
# Get today's date
today = datetime.now().date()

# Create a log file with today's date in the name.
# The default basicConfig record format ("LEVEL:logger:message") is kept.
log_file = f"TwitterAPI_{today}.log"
logging.basicConfig(filename=log_file, level=logging.INFO)

# Tunables for the download loop
TWEETS_CSV = 'tweets.csv'
DAYS_TO_FETCH = 91               # one search window per day
MAX_TWEETS_PER_DAY = 3000
MAX_ATTEMPTS_PER_DAY = 3         # attempts per day when rate limited
DEFAULT_RATE_LIMIT_WAIT = 900    # seconds; used when no Retry-After header is sent

# Define the topic and initial date range
topic = "(ios OR apple OR AAPL OR iphone OR ipad)"
start_date = today - timedelta(days=DAYS_TO_FETCH)


def _fetch_tweets(query, limit):
    """Return up to `limit` English tweets matching `query` as Date/Tweet dicts."""
    cursor = tweepy.Cursor(api.search_tweets, q=query, lang='en', tweet_mode='extended')
    return [
        {'Date': tweet.created_at.date(), 'Tweet': tweet.full_text}
        for tweet in cursor.items(limit)
    ]


def _is_rate_limited(error):
    """Return True when `error` is a rate-limit response (HTTP 429 / API code 88)."""
    return isinstance(error, tweepy.TooManyRequests) or 88 in getattr(error, 'api_codes', ())


def _rate_limit_wait_seconds(error):
    """Return the Retry-After delay carried by `error`, or a default when absent."""
    response = getattr(error, 'response', None)
    headers = getattr(response, 'headers', None) or {}
    try:
        return max(int(headers.get('Retry-After')), 0)
    except (TypeError, ValueError):
        return DEFAULT_RATE_LIMIT_WAIT


def _append_tweets(rows, path):
    """Append `rows` to the CSV at `path`; return True if the header was written.

    Appending avoids re-reading and rewriting the whole file for every day.
    The header is written only when the file is missing or has zero bytes.
    """
    is_new_file = not os.path.isfile(path) or os.path.getsize(path) == 0
    pd.DataFrame(rows, columns=['Date', 'Tweet']).to_csv(
        path, mode='a', header=is_new_file, index=False
    )
    return is_new_file


# Walk forward one day at a time
for _ in range(DAYS_TO_FETCH):
    # Calculate the end date for the current iteration
    end_date = start_date + timedelta(days=1)

    # Format the dates as strings
    start_date_str = start_date.strftime("%Y-%m-%d")
    end_date_str = end_date.strftime("%Y-%m-%d")

    # Define the search query with the current date range
    query = f"{topic} until:{end_date_str} since:{start_date_str}"

    # Fetch tweets on the specified topic; `None` means the download failed.
    tweets = None
    for attempt in range(1, MAX_ATTEMPTS_PER_DAY + 1):
        try:
            tweets = _fetch_tweets(query, MAX_TWEETS_PER_DAY)
            break
        except tweepy.TweepyException as e:
            if _is_rate_limited(e) and attempt < MAX_ATTEMPTS_PER_DAY:
                # Rate limit reached: wait, then retry the same day instead of
                # silently skipping it.
                wait_time = _rate_limit_wait_seconds(e)
                msg = "Rate limit reached. Sleeping for: {} seconds."
                logging.info(msg.format(wait_time))
                time.sleep(wait_time)
                continue
            logging.error("Error: Failed to download tweets.")
            logging.error(e)
            break

    if tweets:
        msg = "Tweets downloaded successfully for the date range: {} to {}"
        logging.info(msg.format(start_date_str, end_date_str))

        if _append_tweets(tweets, TWEETS_CSV):
            logging.info("New CSV file created with the downloaded tweets.")
        else:
            logging.info("Tweets appended to the existing CSV file.")
    elif tweets is not None:
        msg = "No tweets found for the date range: {} to {}"
        logging.info(msg.format(start_date_str, end_date_str))

    # Update the start date for the next iteration
    start_date = end_date

# Log data

In [None]:
# Read the log file written by the download cell into a DataFrame.
# `log_file` is the variable holding the dated file name (not a literal path).
# Records use logging's default "LEVEL:logger:message" layout, so each line is
# split on the first two colons only; colons inside the message are preserved.
with open(log_file, 'r', errors='replace') as f:
    log_records = [line.rstrip('\n').split(':', 2) for line in f if line.strip()]

log_df = pd.DataFrame(
    # Lines without the two-colon prefix (e.g. continuation lines of a
    # multi-line message) are kept whole in the message column.
    [rec if len(rec) == 3 else [None, None, ':'.join(rec)] for rec in log_records],
    columns=['Level', 'Logger', 'Log Message'],
)

# Display the log DataFrame
log_df

# Cleaning tweets

In [None]:
# Pull the previously downloaded tweets from disk into a DataFrame
tweets_csv_path = 'tweets.csv'
df_tweets = pd.read_csv(tweets_csv_path)

# Vocabulary cache so the NLTK word list is loaded once, not once per tweet
_ENGLISH_WORDS_CACHE = None


def _english_vocabulary():
    """Return the NLTK `words` corpus as a set, loading it on first use only."""
    global _ENGLISH_WORDS_CACHE
    if _ENGLISH_WORDS_CACHE is None:
        try:
            vocabulary = words.words()
        except LookupError:
            # The corpus has not been downloaded on this machine yet.
            nltk.download('words', quiet=True)
            vocabulary = words.words()
        _ENGLISH_WORDS_CACHE = set(vocabulary)
    return _ENGLISH_WORDS_CACHE


# Function to clean a single tweet
def clean_tweet(tweet, english_words=None):
    """Strip links, @mentions, punctuation and non-English words from a tweet.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (e.g. NaN read from a CSV)
        yields an empty string.
    english_words : collection of str, optional
        Vocabulary used to decide which words are kept; a word is kept when
        its lower-cased form is in this collection. Defaults to the NLTK
        `words` corpus, loaded once and cached.

    Returns
    -------
    str
        The surviving words joined by single spaces (possibly empty).
    """
    if not isinstance(tweet, str):
        return ''
    if english_words is None:
        english_words = _english_vocabulary()

    # Links and usernames must be removed BEFORE punctuation: stripping
    # punctuation first deletes the "@" and the mention pattern can no
    # longer match, leaving usernames in the text.
    cleaned_tweet = re.sub(r'http\S+|www\S+', '', tweet)
    cleaned_tweet = re.sub(r'@[^\s]+', '', cleaned_tweet)

    # Remove remaining punctuation / special characters
    cleaned_tweet = re.sub(r'[^\w\s]', '', cleaned_tweet)

    # Remove non-English words
    return ' '.join(
        word for word in cleaned_tweet.split() if word.lower() in english_words
    )

# Clean the tweets column.
# Missing values are turned into empty strings first so the cleaner always
# receives text.
df_tweets['Tweet'] = df_tweets['Tweet'].fillna('').astype(str).apply(clean_tweet)

# Remove rows whose tweet is empty after cleaning.
# The cleaner returns '' (not NaN) for such tweets, so `dropna` alone would
# keep them; once saved, pandas reads the empty fields back as NaN.
df_tweets = df_tweets[df_tweets['Tweet'].str.strip() != '']

# Save the cleaned tweets back to the CSV file
df_tweets.to_csv('tweets.csv', index=False)

# Tweet EDA

In [None]:
# Reload the cleaned tweets from disk
df_tweets = pd.read_csv('tweets.csv')

# Basic exploratory overview: size, schema and a few example rows
overview = [
    ("Number of tweets:", len(df_tweets)),
    ("Columns:", df_tweets.columns),
]
for label, value in overview:
    print(label, value)
print("Sample tweets:")
print(df_tweets.head())

In [None]:
# Perform sentiment analysis using Vader SentimentIntensityAnalyzer
nltk.download('vader_lexicon', quiet=True)
sia = SentimentIntensityAnalyzer()

# Tweets that ended up empty after cleaning are read back from the CSV as
# NaN (a float), which the analyzer cannot score; drop them first.
df_tweets = df_tweets.dropna(subset=['Tweet']).reset_index(drop=True)
df_tweets['Sentiment'] = df_tweets['Tweet'].astype(str).apply(
    lambda x: sia.polarity_scores(x)['compound']
)

# Visualize sentiment distribution
plt.figure(figsize=(8, 6))
sns.histplot(df_tweets['Sentiment'], bins=30, kde=True)
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Generate word cloud of most frequent words.
# The stopword list is an NLTK corpus that must be present locally.
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

# Skip missing tweets: joining fails on NaN (float) entries.
all_tweet_text = ' '.join(df_tweets['Tweet'].dropna().astype(str))
wordcloud = WordCloud(stopwords=stop_words, background_color='white').generate(all_tweet_text)

plt.figure(figsize=(10, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Most Frequent Words in Tweets')
plt.axis('off')
plt.show()

# Sentiment analysis

In [None]:
def _label_polarity(polarity):
    """Map a polarity score to 'Positive' (> 0), 'Negative' (< 0) or 'Neutral'."""
    if polarity > 0:
        return 'Positive'
    if polarity < 0:
        return 'Negative'
    return 'Neutral'


# Load the tweets from the CSV file.
# Rows without text are dropped: empty fields are read back as NaN, and
# TextBlob only accepts strings.
df = pd.read_csv('tweets.csv').dropna(subset=['Tweet']).reset_index(drop=True)

# Perform sentiment analysis using TextBlob
df['sentiment'] = df['Tweet'].astype(str).apply(lambda x: TextBlob(x).sentiment.polarity)

# Classify sentiment as positive, negative, or neutral
df['sentiment_label'] = df['sentiment'].apply(_label_polarity)

# Save the updated DataFrame to CSV
df.to_csv('tweets_sentiment.csv', index=False)

In [None]:
# Print the forecast data.
# The forecasts are produced by the "Time series forecast of the sentiment"
# cell further down, so on a top-to-bottom run they do not exist yet when
# this cell executes. Report that instead of failing with a NameError.
_forecast_names = ('forecast_1w', 'forecast_1m', 'forecast_3m')
if all(name in globals() for name in _forecast_names):
    # Convert forecast data to strings
    forecast_1w_str = forecast_1w.to_string(header=False)
    forecast_1m_str = forecast_1m.to_string(header=False)
    forecast_3m_str = forecast_3m.to_string(header=False)

    print("1 Week Forecast:")
    print(forecast_1w_str)
    print("1 Month Forecast:")
    print(forecast_1m_str)
    print("3 Months Forecast:")
    print(forecast_3m_str)
else:
    print("Forecasts are not available yet - run the time series forecast cell first.")

In [None]:
# Read the tweets data from the CSV file
df_tweets = pd.read_csv('tweets_sentiment.csv')

# Perform basic exploratory data analysis (EDA)
print("Number of tweets:", len(df_tweets))
print("Columns:", df_tweets.columns)
print("Sample tweets:")
print(df_tweets.head())

# Visualize sentiment distribution by sentiment category.
# tweets_sentiment.csv stores the polarity score in 'sentiment' and its
# class in 'sentiment_label'; it has no 'Sentiment' or 'Category' column.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(
    data=df_tweets,
    x='sentiment',
    hue='sentiment_label',
    bins=30,
    multiple='stack',
    ax=ax,
)
ax.set_title('Sentiment Distribution by Category')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
# Retitle the legend seaborn created for the hue variable
category_legend = ax.get_legend()
if category_legend is not None:
    category_legend.set_title('Category')
plt.show()

In [None]:
# Visualize sentiment distribution.
# The class of each tweet lives in the 'sentiment_label' column of
# tweets_sentiment.csv (there is no 'Sentiment' column in that file).
plt.figure(figsize=(8, 6))
sns.countplot(x='sentiment_label', data=df_tweets)
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.show()

In [None]:
# Convert the 'Date' column to datetime
df_tweets['Date'] = pd.to_datetime(df_tweets['Date'])

# Group by date and sentiment class to calculate counts.
# The class column written to tweets_sentiment.csv is 'sentiment_label'.
df_sentiment_counts = (
    df_tweets.groupby(['Date', 'sentiment_label']).size().reset_index(name='Count')
)

# Pivot the data to have sentiment types as columns; a date with no tweets
# of a given class counts as 0 rather than a missing point in the line.
df_sentiment_pivot = df_sentiment_counts.pivot(
    index='Date', columns='sentiment_label', values='Count'
).fillna(0)

# Plot the sentiment distribution over time
plt.figure(figsize=(12, 6))
sns.lineplot(data=df_sentiment_pivot, dashes=False)
plt.title('Sentiment Distribution Over Time')
plt.xlabel('Date')
plt.ylabel('Count')
plt.legend(title='Sentiment')
plt.xticks(rotation=45)
plt.show()

In [None]:
# `word_tokenize` needs the punkt tokenizer data; newer NLTK releases ship it
# as 'punkt_tab', older ones as 'punkt'.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Tokenize the tweets into words, remembering each tweet's sentiment class.
# Rows without text (NaN) are skipped because the tokenizer needs a string.
labelled_tweets = df_tweets.dropna(subset=['Tweet'])
word_label_pairs = [
    (word.lower(), label)
    for tweet, label in zip(labelled_tweets['Tweet'].astype(str), labelled_tweets['sentiment_label'])
    for word in word_tokenize(tweet)
]
tokenized_words = [word for word, _ in word_label_pairs]

# Calculate the frequency distribution of words
freq_dist = FreqDist(tokenized_words)
most_common = freq_dist.most_common(20)
top_words = [word for word, _ in most_common]

# Count how often each of the top words occurs per sentiment class, so the
# plot has an actual 'Sentiment' column to colour by.
word_sentiment_counts = (
    pd.DataFrame(word_label_pairs, columns=['Word', 'Sentiment'])
    .loc[lambda frame: frame['Word'].isin(top_words)]
    .groupby(['Word', 'Sentiment'])
    .size()
    .reset_index(name='Count')
)

# Plot the most frequent words by sentiment
plt.figure(figsize=(10, 6))
sns.barplot(x='Count', y='Word', hue='Sentiment', data=word_sentiment_counts, order=top_words)
plt.title('Most Frequent Words by Sentiment')
plt.xlabel('Count')
plt.ylabel('Word')
plt.legend(title='Sentiment')
plt.show()

# Time series forecast of the sentiment

In [None]:
# Load the tweet sentiment data from the CSV file
df = pd.read_csv('tweets_sentiment.csv', parse_dates=['Date'])

# Set the 'Date' column as the (sorted) index
df = df.set_index('Date').sort_index()

# Many tweets share a date, so fitting on the raw rows would make one model
# step equal one tweet. Aggregate to one mean score per calendar day so that
# a forecast step is a day, matching the date axis of the plot.
# Days without tweets become NaN in the resampled series.
daily_sentiment = df['sentiment'].resample('D').mean()

# Forecast horizons in days
HORIZON_1W, HORIZON_1M, HORIZON_3M = 7, 30, 90

try:
    # Fit an ARIMA model to the daily sentiment data
    model = sm.tsa.ARIMA(daily_sentiment, order=(1, 0, 1), trend='c').fit()

    # Forecast once for the longest horizon; the shorter horizons are its
    # leading slices, so every forecast has exactly one value per date.
    forecast_3m = model.forecast(steps=HORIZON_3M)
    forecast_1m = forecast_3m.iloc[:HORIZON_1M]
    forecast_1w = forecast_3m.iloc[:HORIZON_1W]

    # Dates of the forecast steps: the days following the last observed day
    forecast_dates_3m = pd.date_range(
        start=daily_sentiment.index[-1] + pd.Timedelta(days=1),
        periods=HORIZON_3M,
        freq='D',
    )
    forecast_dates_1m = forecast_dates_3m[:HORIZON_1M]
    forecast_dates_1w = forecast_dates_3m[:HORIZON_1W]

    # Create Plotly figure
    fig = go.Figure()

    # Add actual sentiment data
    fig.add_trace(go.Scatter(x=daily_sentiment.index, y=daily_sentiment, name='Actual'))

    # Add forecasted sentiment data
    fig.add_trace(go.Scatter(x=forecast_dates_1w, y=forecast_1w.to_numpy(), name='1 Week Forecast'))
    fig.add_trace(go.Scatter(x=forecast_dates_1m, y=forecast_1m.to_numpy(), name='1 Month Forecast'))
    fig.add_trace(go.Scatter(x=forecast_dates_3m, y=forecast_3m.to_numpy(), name='3 Months Forecast'))

    # Update layout
    fig.update_layout(
        title='Time Series Forecast of Sentiment',
        xaxis_title='Date',
        yaxis_title='Sentiment',
        legend_title='Forecast',
        hovermode='x unified'
    )

    # Show the interactive Plotly graph
    fig.show()

except ValueError as e:
    print("Error: Failed to make time series forecast.")
    print(e)