Exercise 1

In [6]:
# (a) Structured dataset — Product Sales

import pandas as pd
from google.colab import files

# Upload Product Sales dataset (sales_statsfinal.csv)
uploaded = files.upload()

sales_df = pd.read_csv("sales_statsfinal.csv")

print("Sales dataset sample:")
print(sales_df.head())
print(sales_df.info())

# Basic analysis adapted to columns (Q-P1..Q-P4 are quantities, S-P1..S-P4 are sales)
# Total revenue per product (sum of sales)
revenue_cols = ["S-P1", "S-P2", "S-P3", "S-P4"]
quantity_cols = ["Q-P1", "Q-P2", "Q-P3", "Q-P4"]

sales_df["Total_Revenue"] = sales_df[revenue_cols].sum(axis=1)
sales_df["Total_Quantity"] = sales_df[quantity_cols].sum(axis=1)

print("Total revenue across dataset:", sales_df["Total_Revenue"].sum())
print("Average daily revenue:", sales_df["Total_Revenue"].mean())

# Top products by total quantity sold
for i, col in enumerate(quantity_cols, start=1):
    print(f"Total quantity sold for Product {i}: {sales_df[col].sum()}")

# Top products by total revenue
for i, col in enumerate(revenue_cols, start=1):
    print(f"Total revenue for Product {i}: {sales_df[col].sum()}")



Saving sales_statsfinal.csv to sales_statsfinal (3).csv
Sales dataset sample:
   Unnamed: 0        Date  Q-P1  Q-P2  Q-P3  Q-P4      S-P1      S-P2  \
0           0  13-06-2010  5422  3725   576   907  17187.74  23616.50   
1           1  14-06-2010  7047   779  3578  1574  22338.99   4938.86   
2           2  15-06-2010  1572  2082   595  1145   4983.24  13199.88   
3           3  16-06-2010  5657  2399  3140  1672  17932.69  15209.66   
4           4  17-06-2010  3668  3207  2184   708  11627.56  20332.38   

       S-P3      S-P4  
0   3121.92   6466.91  
1  19392.76  11222.62  
2   3224.90   8163.85  
3  17018.80  11921.36  
4  11837.28   5048.04  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  4600 non-null   int64  
 1   Date        4600 non-null   object 
 2   Q-P1        4600 non-null   int64  
 3   Q-P2        4600 non-n

In [8]:
# (b) Unstructured dataset — Customer Support Tickets

from google.colab import files
import pandas as pd
from textblob import TextBlob

# Upload Customer Support Tickets dataset (customer_support_tickets.csv)
uploaded = files.upload()

tickets_df = pd.read_csv("customer_support_tickets.csv")

print("Tickets dataset sample:")
print(tickets_df.head())

# Use 'Ticket Description' column instead of 'TicketText'
tickets_df["sentiment_score"] = tickets_df["Ticket Description"].apply(lambda x: TextBlob(str(x)).sentiment.polarity)

print("Average sentiment:", tickets_df["sentiment_score"].mean())
print("Most negative tickets:")
print(tickets_df.sort_values("sentiment_score").head(5)[["Ticket Description", "sentiment_score"]])


Saving customer_support_tickets.csv to customer_support_tickets (1).csv
Tickets dataset sample:
   Ticket ID        Customer Name              Customer Email  Customer Age  \
0          1        Marisa Obrien  carrollallison@example.com            32   
1          2         Jessica Rios    clarkeashley@example.com            42   
2          3  Christopher Robbins   gonzalestracy@example.com            48   
3          4     Christina Dillon    bradleyolson@example.org            27   
4          5    Alexander Carroll     bradleymark@example.com            67   

  Customer Gender Product Purchased Date of Purchase      Ticket Type  \
0           Other        GoPro Hero       2021-03-22  Technical issue   
1          Female       LG Smart TV       2021-05-22  Technical issue   
2           Other          Dell XPS       2020-07-14  Technical issue   
3          Female  Microsoft Office       2020-11-13  Billing inquiry   
4          Female  Autodesk AutoCAD       2020-02-04  Billing in

Exercise 2

In [9]:
import pandas as pd
import re
from google.colab import files

# Upload Tweets dataset (Tweets.csv)
uploaded = files.upload()

tweets_df = pd.read_csv("Tweets.csv")

print("Tweets dataset sample:")
print(tweets_df.head())


Saving Tweets.csv to Tweets.csv
Tweets dataset sample:
       textID                                               text  \
0  cb774db0d1                I`d have responded, if I were going   
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!   
2  088c60f138                          my boss is bullying me...   
3  9642c003ef                     what interview! leave me alone   
4  358bd9e861   Sons of ****, why couldn`t they put them on t...   

                         selected_text sentiment  
0  I`d have responded, if I were going   neutral  
1                             Sooo SAD  negative  
2                          bullying me  negative  
3                       leave me alone  negative  
4                        Sons of ****,  negative  


In [13]:
# Extract hashtags
tweets_df["hashtags"] = tweets_df["text"].apply(lambda x: re.findall(r"#\\w+", str(x)))

# Extract mentions
tweets_df["mentions"] = tweets_df["text"].apply(lambda x: re.findall(r"@\\w+", str(x)))

# Keep structured version
structured_tweets = tweets_df[["text", "sentiment", "hashtags", "mentions"]]

print("Structured tweets sample:")
print(structured_tweets.head())


Structured tweets sample:
                                                text sentiment hashtags  \
0                I`d have responded, if I were going   neutral       []   
1      Sooo SAD I will miss you here in San Diego!!!  negative       []   
2                          my boss is bullying me...  negative       []   
3                     what interview! leave me alone  negative       []   
4   Sons of ****, why couldn`t they put them on t...  negative       []   

  mentions  
0       []  
1       []  
2       []  
3       []  
4       []  


In [16]:
# Most common hashtags
all_hashtags = sum(structured_tweets["hashtags"], [])
hashtags_series = pd.Series(all_hashtags)
print("Top 5 hashtags:")
print(hashtags_series.value_counts().head(5))

# Map sentiment labels to numeric values for analysis
sentiment_map = {"negative": -1, "neutral": 0, "positive": 1}
structured_tweets["sentiment_numeric"] = structured_tweets["sentiment"].map(sentiment_map)

# Compare sentiment
print("Average sentiment (with hashtags):", structured_tweets[structured_tweets["hashtags"].str.len() > 0]["sentiment_numeric"].mean())
print("Average sentiment (without hashtags):", structured_tweets[structured_tweets["hashtags"].str.len() == 0]["sentiment_numeric"].mean())


# Mapper les sentiments sur des valeurs numériques
sentiment_map = {"negative": -1, "neutral": 0, "positive": 1}
structured_tweets.loc[:, "sentiment_numeric"] = structured_tweets["sentiment"].map(sentiment_map)

# Extraire tous les hashtags
all_hashtags = sum(structured_tweets["hashtags"], [])
hashtags_series = pd.Series(all_hashtags)

print("Top 5 hashtags:")
if hashtags_series.empty:
    print("⚠️ Aucun hashtag trouvé dans le dataset.")
else:
    print(hashtags_series.value_counts().head(5))

# Comparer la moyenne des sentiments
with_hashtags = structured_tweets[structured_tweets["hashtags"].str.len() > 0]["sentiment_numeric"].mean()
without_hashtags = structured_tweets[structured_tweets["hashtags"].str.len() == 0]["sentiment_numeric"].mean()

print("Average sentiment (with hashtags):", with_hashtags if pd.notna(with_hashtags) else "N/A")
print("Average sentiment (without hashtags):", without_hashtags if pd.notna(without_hashtags) else "N/A")


Top 5 hashtags:
Series([], Name: count, dtype: int64)
Average sentiment (with hashtags): nan
Average sentiment (without hashtags): 0.029147410938466577
Top 5 hashtags:
⚠️ Aucun hashtag trouvé dans le dataset.
Average sentiment (with hashtags): N/A
Average sentiment (without hashtags): 0.029147410938466577


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  structured_tweets["sentiment_numeric"] = structured_tweets["sentiment"].map(sentiment_map)
