In [1]:
# !pip uninstall gensim
# !pip uninstall scipy
# !pip install scipy==1.10.1
# !pip install gensim==4.3.2


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [3]:
# CSV of image descriptions; judging by the preview below it appears to hold
# one row per sampled video frame (timestamp, description text, video metadata).
comprehensive_analysis_file = "comprehensive_analysis.csv"
image_details_df = pd.read_csv(comprehensive_analysis_file)
# Display the column names so later cells can be checked against the schema.
image_details_df.columns

Index(['Timestamp', 'Detected Objects', 'video_id', 'video_url',
       'video_title'],
      dtype='object')

In [4]:
# Preview the first rows; "Detected Objects" is the free-text column modelled below.
image_details_df.head()

Unnamed: 0,Timestamp,Detected Objects,video_id,video_url,video_title
0,0.0,The image you've provided appears to be a low...,neIYdLysqlk,https://www.youtube.com/watch?v=neIYdLysqlk,I tested the Craziest Xiaomi Gadgets!
1,66.9208,The image shows a nighttime scene with the fo...,neIYdLysqlk,https://www.youtube.com/watch?v=neIYdLysqlk,I tested the Craziest Xiaomi Gadgets!
2,133.8416,"The image shows a close-up of a modern, compa...",neIYdLysqlk,https://www.youtube.com/watch?v=neIYdLysqlk,I tested the Craziest Xiaomi Gadgets!
3,200.7624,The image shows a close-up view of a 3D print...,neIYdLysqlk,https://www.youtube.com/watch?v=neIYdLysqlk,I tested the Craziest Xiaomi Gadgets!
4,267.6832,"In the image, there is a person who appears t...",neIYdLysqlk,https://www.youtube.com/watch?v=neIYdLysqlk,I tested the Craziest Xiaomi Gadgets!


In [5]:
# Number of topics/components shared by every model in this notebook
# (NMF, LSA and LDA). The hand-written NMF topic names further down list
# exactly 11 entries, so revisit them whenever this value changes.
NUM_TOPICS = 11

## TF-IDF

In [6]:
# Preprocessing: TF-IDF Vectorization
# Each "Detected Objects" description is one document; English stop words are dropped.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# Document-term matrix that both the NMF and LSA models below are fitted on.
tfidf_matrix = tfidf_vectorizer.fit_transform(image_details_df["Detected Objects"])
# Vocabulary terms, used to translate component indices back into words.
tfidf_features = tfidf_vectorizer.get_feature_names_out()

# Function to extract top words per topic
def get_top_words(model, feature_names, n_top_words=10):
    """Return the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model
        Fitted model exposing ``components_``: one row of term weights per topic.
    feature_names
        Sequence mapping a column position in ``components_`` to its term.
    n_top_words : int, default 10
        Maximum number of terms to return per topic.

    Returns
    -------
    list of list
        One inner list per topic, ordered from the largest weight downwards.
    """
    top_words = []
    for weights in model.components_:
        # argsort is ascending, so reverse it and keep the leading entries.
        strongest = weights.argsort()[::-1][:n_top_words]
        top_words.append([feature_names[position] for position in strongest])
    return top_words

## NMF Percentage Topic Modeling

In [7]:
# Factorise the TF-IDF matrix into NUM_TOPICS non-negative topics.
# random_state is fixed so the topic order is reproducible across runs.
nmf_model = NMF(n_components=NUM_TOPICS, random_state=15)
nmf_topics = nmf_model.fit_transform(tfidf_matrix)
nmf_top_words = get_top_words(nmf_model, tfidf_features)

# Print top words for NMF
print("Top Words for NMF Topics:")
for i, words in enumerate(nmf_top_words):
    print(f"Topic {i+1}: {', '.join(words)}")

# Rescale every row so its topic weights sum to 1 (per-frame topic shares).
# `nmf_topics` is rebound explicitly: a later cell rebuilds the frame from this
# array, and previously it was only normalised as a side effect of assigning
# into a DataFrame that happened to share memory with it. That aliasing is not
# guaranteed (pandas Copy-on-Write copies the array), so do it in plain sight.
nmf_topics = normalize(nmf_topics, norm='l1', axis=1)

# Create a DataFrame for NMF: original columns followed by one column per topic.
nmf_df = pd.DataFrame(nmf_topics, columns=[f"Topic {i+1}" for i in range(NUM_TOPICS)])
nmf_df = pd.concat([image_details_df, nmf_df], axis=1)

Top Words for NMF Topics:
Topic 1: photo, person, image, individual, taken, wearing, camera, different, lighting, sky
Topic 2: desk, person, room, wall, sitting, monitor, individual, items, office, plant
Topic 3: components, small, tools, electronic, repair, pair, parts, electronics, screwdriver, screws
Topic 4: laptop, keyboard, screen, open, desk, visible, computer, laptops, apple, image
Topic 5: interface, text, screen, image, icon, video, icons, corner, content, user
Topic 6: man, wearing, shirt, hair, beard, image, dark, short, microphone, table
Topic 7: watch, smartwatch, strap, apple, smartwatches, face, display, watches, digital, black
Topic 8: tablet, screen, person, game, holding, hands, image, interacting, blurred, background
Topic 9: phone, smartphone, camera, case, screen, holding, image, visible, person, hand
Topic 10: device, object, hand, image, white, holding, small, person, electronic, surface
Topic 11: woman, mountains, wall, hair, image, landscape, decorative, clear

In [8]:
# Human-readable labels assigned to the NMF topics, based on the top words printed above:
# Topic 1: Portraits and Outdoor Photography
# Topic 2: Office Spaces and Desk Items
# Topic 3: Electronics and Repair Tools
# Topic 4: Laptops and Workstation Setups
# Topic 5: User Interfaces and Digital Screens
# Topic 6: Men with Accessories and Props
# Topic 7: Smartwatches and Digital Devices
# Topic 8: Tablet Usage and Gaming Scenes
# Topic 9: Smartphones and Device Handling
# Topic 10: Handheld Electronics and Gadgets
# Topic 11: Landscape Art and Decor Themes

In [9]:
# Short column names for the NMF topics, listed in topic order (Topic 1, 2, ...).
# The "i_" prefix is what later cells use to pick out the topic columns.
short_topic_names = [
    "i_Portraits_Outdoor",
    "i_Office_Desk_Items",
    "i_Electronics_Repair",
    "i_Laptops_Workstations",
    "i_UI_Digital_Screens",
    "i_Men_Accessories",
    "i_Smartwatches_Devices",
    "i_Tablet_Gaming",
    "i_Smartphones_Handling",
    "i_Handheld_Gadgets",
    "i_Landscape_Decor"
]

# Use at most one name per fitted topic. The count comes straight from the
# model output, so this cell does not depend on a previously built `nmf_df`.
n_nmf_topics = nmf_topics.shape[1]
short_topic_names = short_topic_names[:n_nmf_topics]

# Map the generic "Topic N" labels to the short names.
new_column_names = {
    f"Topic {i+1}": name
    for i, name in enumerate(short_topic_names)
}

# Normalise explicitly so each row's topic weights sum to 1. This cell used to
# rely on `nmf_topics` having been normalised in place through an earlier
# DataFrame view, which is not guaranteed. Re-normalising rows that already
# sum to 1 leaves them unchanged, so this is safe either way.
nmf_topic_shares = normalize(nmf_topics, norm='l1', axis=1)

# Original data followed by the per-frame topic shares, with readable names.
nmf_df = pd.concat(
    [
        image_details_df,
        pd.DataFrame(
            nmf_topic_shares,
            columns=[f"Topic {i+1}" for i in range(n_nmf_topics)],
        ),
    ],
    axis=1,
).rename(columns=new_column_names)

nmf_df.head()

Unnamed: 0,Timestamp,Detected Objects,video_id,video_url,video_title,i_Portraits_Outdoor,i_Office_Desk_Items,i_Electronics_Repair,i_Laptops_Workstations,i_UI_Digital_Screens,i_Men_Accessories,i_Smartwatches_Devices,i_Tablet_Gaming,i_Smartphones_Handling,i_Handheld_Gadgets,i_Landscape_Decor
0,0.0,The image you've provided appears to be a low...,neIYdLysqlk,https://www.youtube.com/watch?v=neIYdLysqlk,I tested the Craziest Xiaomi Gadgets!,0.347895,0.0,0.0,0.0,0.408451,0.010373,0.0,0.0,0.0,0.23328,0.0
1,66.9208,The image shows a nighttime scene with the fo...,neIYdLysqlk,https://www.youtube.com/watch?v=neIYdLysqlk,I tested the Craziest Xiaomi Gadgets!,0.317538,0.058515,0.0,0.0,0.052363,0.025637,0.0,0.0,0.0,0.398638,0.147308
2,133.8416,"The image shows a close-up of a modern, compa...",neIYdLysqlk,https://www.youtube.com/watch?v=neIYdLysqlk,I tested the Craziest Xiaomi Gadgets!,0.0,0.0,0.058867,0.0,0.13275,0.015685,0.045901,0.013202,0.010486,0.723109,0.0
3,200.7624,The image shows a close-up view of a 3D print...,neIYdLysqlk,https://www.youtube.com/watch?v=neIYdLysqlk,I tested the Craziest Xiaomi Gadgets!,0.092078,0.0,0.619122,0.0,0.089932,0.0,0.0,0.0,0.0,0.198867,0.0
4,267.6832,"In the image, there is a person who appears t...",neIYdLysqlk,https://www.youtube.com/watch?v=neIYdLysqlk,I tested the Craziest Xiaomi Gadgets!,0.029465,0.259587,0.156904,0.0,0.0,0.106407,0.012146,0.014042,0.02655,0.311555,0.083345


In [10]:
# Confirm the topic columns now carry the short "i_" names.
nmf_df.columns

Index(['Timestamp', 'Detected Objects', 'video_id', 'video_url', 'video_title',
       'i_Portraits_Outdoor', 'i_Office_Desk_Items', 'i_Electronics_Repair',
       'i_Laptops_Workstations', 'i_UI_Digital_Screens', 'i_Men_Accessories',
       'i_Smartwatches_Devices', 'i_Tablet_Gaming', 'i_Smartphones_Handling',
       'i_Handheld_Gadgets', 'i_Landscape_Decor'],
      dtype='object')

In [11]:
# Identify topic columns that start with 'i_'
topic_columns = [col for col in nmf_df.columns if col.startswith('i_')]

# Average the per-frame topic shares within each video.
# sort=False keeps videos in order of first appearance, which is what the
# former inner merge against the de-duplicated `video_id` column achieved.
grouped_topic_means = nmf_df.groupby('video_id', as_index=False, sort=False)[topic_columns].mean()

# One row per video: `video_id` followed by the mean of every topic column.
grouped_df = grouped_topic_means.copy()

# Output the grouped DataFrame
print("Grouped DataFrame with Mean Values for `i_*` Columns by `video_id`:")
grouped_df.head()


Grouped DataFrame with Mean Values for `i_*` Columns by `video_id`:


Unnamed: 0,video_id,i_Portraits_Outdoor,i_Office_Desk_Items,i_Electronics_Repair,i_Laptops_Workstations,i_UI_Digital_Screens,i_Men_Accessories,i_Smartwatches_Devices,i_Tablet_Gaming,i_Smartphones_Handling,i_Handheld_Gadgets,i_Landscape_Decor
0,neIYdLysqlk,0.144373,0.111729,0.049618,0.007595,0.074713,0.110112,0.009071,0.029046,0.087018,0.318479,0.058245
1,YX8ks42Azn8,0.175335,0.09988,0.040972,0.01055,0.099863,0.051743,0.019642,0.084691,0.146995,0.250905,0.019425
2,4RcThoRG46c,0.140951,0.194708,0.034888,0.043597,0.178386,0.132647,0.009894,0.041346,0.025141,0.166527,0.031916
3,vSIbvJB4WdI,0.327336,0.044627,0.013596,0.006036,0.240204,0.122107,0.01607,0.020679,0.123044,0.068881,0.01742
4,cRPBp2tRxFY,0.195492,0.107757,0.014325,0.030166,0.120749,0.097365,0.013171,0.024163,0.245295,0.113471,0.038047


In [12]:
# Persist the per-video topic means; index=False keeps the row index out of the file.
output_filename = "image_topic_analysis.csv"
grouped_df.to_csv(output_filename, index=False)

# Confirmation message with the path that was written.
print(f"Grouped DataFrame saved to {output_filename}")

Grouped DataFrame saved to image_topic_analysis.csv


## LSA Topic Modeling

In [13]:
# LSA: truncated SVD of the same TF-IDF matrix, with a fixed seed.
lsa_model = TruncatedSVD(n_components=NUM_TOPICS, random_state=15)
lsa_topics = lsa_model.fit_transform(tfidf_matrix)
lsa_top_words = get_top_words(lsa_model, tfidf_features)

# List the strongest terms of each component, numbered from 1.
print("\nTop Words for LSA Topics:")
for topic_number, words in enumerate(lsa_top_words, start=1):
    print(f"Topic {topic_number}: {', '.join(words)}")


Top Words for LSA Topics:
Topic 1: image, person, screen, desk, visible, appears, smartphone, phone, laptop, holding
Topic 2: desk, laptop, room, wall, sitting, man, monitor, hair, gesturing, woman
Topic 3: small, laptop, electronic, components, items, pair, desk, object, tools, repair
Topic 4: laptop, keyboard, screen, interface, computer, open, displaying, icons, laptops, apple
Topic 5: watch, text, product, box, right, related, black, audio, electronic, smartwatch
Topic 6: man, tablet, laptop, device, photo, electronic, wearing, hand, watch, shirt
Topic 7: watch, man, apple, case, laptop, smartwatch, white, surface, strap, phones
Topic 8: watch, tablet, person, smartwatch, strap, displaying, game, holding, hand, smartwatches
Topic 9: man, phone, smartphone, screen, interface, microphone, desk, audio, recording, watch
Topic 10: tablet, man, white, surface, object, objects, interface, phones, blue, red
Topic 11: woman, watch, tablet, case, man, components, tools, table, video, sky


In [14]:
# L1-scale each frame's LSA scores into a separate array, mirroring how the
# LDA section keeps its normalised output. This leaves `lsa_topics` untouched
# instead of overwriting it through a DataFrame view, whose effect on the
# underlying array depends on the pandas version.
# NOTE: LSA scores can be negative, so rows are divided by the sum of their
# absolute values; the results are not percentage shares.
lsa_topics_normalized = normalize(lsa_topics, norm='l1', axis=1)

# Original data followed by one scaled column per LSA component.
lsa_df = pd.concat(
    [
        image_details_df,
        pd.DataFrame(
            lsa_topics_normalized,
            columns=[f"Topic {i+1}" for i in range(NUM_TOPICS)],
        ),
    ],
    axis=1,
)
lsa_df.head()

Unnamed: 0,Timestamp,Detected Objects,video_id,video_url,video_title,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,Topic 11
0,0.0,The image you've provided appears to be a low...,neIYdLysqlk,https://www.youtube.com/watch?v=neIYdLysqlk,I tested the Craziest Xiaomi Gadgets!,0.220318,-0.045002,-0.160487,0.022966,0.090568,0.021247,-0.083095,-0.079938,-0.160435,0.052273,-0.063671
1,66.9208,The image shows a nighttime scene with the fo...,neIYdLysqlk,https://www.youtube.com/watch?v=neIYdLysqlk,I tested the Craziest Xiaomi Gadgets!,0.30351,-0.008064,-0.101057,-0.052987,0.025382,0.032073,0.027251,-0.059061,-0.256305,0.104576,-0.029734
2,133.8416,"The image shows a close-up of a modern, compa...",neIYdLysqlk,https://www.youtube.com/watch?v=neIYdLysqlk,I tested the Craziest Xiaomi Gadgets!,0.294855,-0.209225,0.030704,-0.009988,0.087372,0.172066,-0.064164,0.004682,0.005896,0.021896,-0.099152
3,200.7624,The image shows a close-up view of a 3D print...,neIYdLysqlk,https://www.youtube.com/watch?v=neIYdLysqlk,I tested the Craziest Xiaomi Gadgets!,0.191586,-0.150336,0.134684,-0.031411,0.106068,0.072516,-0.077556,-0.111097,-0.089385,0.011042,0.024318
4,267.6832,"In the image, there is a person who appears t...",neIYdLysqlk,https://www.youtube.com/watch?v=neIYdLysqlk,I tested the Craziest Xiaomi Gadgets!,0.408516,0.033424,0.123051,-0.187555,0.030121,0.02876,0.026585,0.036449,-0.075628,0.040821,-0.00909


## LDA for Topics

In [15]:
# LDA below is fitted on raw term counts rather than the TF-IDF matrix, so
# build a separate bag-of-words matrix from the same text column.
count_vectorizer = CountVectorizer(stop_words='english')
count_matrix = count_vectorizer.fit_transform(image_details_df["Detected Objects"])
# Vocabulary terms, used to translate LDA component indices back into words.
count_features = count_vectorizer.get_feature_names_out()

In [16]:
# Fit LDA on the term-count matrix, with a fixed seed for reproducible topics.
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, random_state=15)
lda_topics = lda_model.fit_transform(count_matrix)

# Normalize LDA contributions (optional: interpret as percentage contribution)
# Kept in a separate array so the raw model output stays available.
lda_topics_normalized = normalize(lda_topics, norm='l1', axis=1)

# Extract top words for each topic with the shared `get_top_words` helper
# defined in the TF-IDF section. It was re-declared here with an identical
# body; the duplicate is removed so there is a single definition to maintain.
lda_top_words = get_top_words(lda_model, count_features)

# Print top words for each LDA topic
print("Top Words for LDA Topics:")
for i, words in enumerate(lda_top_words):
    print(f"Topic {i+1}: {', '.join(words)}")

Top Words for LDA Topics:
Topic 1: image, person, appears, visible, desk, video, right, background, left, screen
Topic 2: image, laptop, screen, text, appears, visible, background, right, possibly, phone
Topic 3: image, watch, text, resolution, design, visible, appears, graphic, background, features
Topic 4: game, icon, gaming, icons, interface, bar, shows, screen, related, user
Topic 5: image, person, background, appears, smartphone, holding, visible, indoor, phone, setting
Topic 6: performance, core, image, cpu, score, numerical, columns, speed, chart, graph
Topic 7: interface, image, screen, icons, settings, user, various, window, menu, screenshot
Topic 8: image, person, device, small, appears, electronic, holding, smartphone, visible, background
Topic 9: pro, text, image, max, device, product, 16, reads, comparison, overlay
Topic 10: image, background, person, visible, holding, appears, smartphone, hand, phone, white
Topic 11: keyboard, desk, person, laptop, controller, gaming, com

In [17]:
# Attach the normalised LDA topic weights to the original rows, one column per topic.
lda_topic_labels = [f"Topic {number}" for number in range(1, NUM_TOPICS + 1)]
lda_df = pd.concat(
    [image_details_df, pd.DataFrame(lda_topics_normalized, columns=lda_topic_labels)],
    axis=1,
)
lda_df.head()

Unnamed: 0,Timestamp,Detected Objects,video_id,video_url,video_title,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,Topic 11
0,0.0,The image you've provided appears to be a low...,neIYdLysqlk,https://www.youtube.com/watch?v=neIYdLysqlk,I tested the Craziest Xiaomi Gadgets!,0.754489,0.002457,0.223396,0.002457,0.002457,0.002457,0.002457,0.002457,0.002457,0.002457,0.002457
1,66.9208,The image shows a nighttime scene with the fo...,neIYdLysqlk,https://www.youtube.com/watch?v=neIYdLysqlk,I tested the Craziest Xiaomi Gadgets!,0.857666,0.000957,0.133721,0.000957,0.000957,0.000957,0.000957,0.000957,0.000957,0.000957,0.000957
2,133.8416,"The image shows a close-up of a modern, compa...",neIYdLysqlk,https://www.youtube.com/watch?v=neIYdLysqlk,I tested the Craziest Xiaomi Gadgets!,0.002598,0.558986,0.002598,0.002597,0.002598,0.002597,0.002598,0.417635,0.002597,0.002598,0.002598
3,200.7624,The image shows a close-up view of a 3D print...,neIYdLysqlk,https://www.youtube.com/watch?v=neIYdLysqlk,I tested the Craziest Xiaomi Gadgets!,0.000664,0.000664,0.000664,0.035835,0.000664,0.000664,0.000664,0.311303,0.000664,0.647553,0.000664
4,267.6832,"In the image, there is a person who appears t...",neIYdLysqlk,https://www.youtube.com/watch?v=neIYdLysqlk,I tested the Craziest Xiaomi Gadgets!,0.001568,0.001568,0.001567,0.001567,0.548994,0.001567,0.001567,0.436899,0.001567,0.001568,0.001568
