In [9]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
import time
import plotly.graph_objects as go
import plotly.express as px
from Levenshtein import distance

In [22]:
headers_desktop = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0;Win64) AppleWebkit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'}
headers_mobile = {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1'}

In [23]:
# Read csv file
df = pd.read_csv('../EveryTraceOfFampayPeriod/famapp_unique_links.csv')
df

Unnamed: 0,Link
0,https://famapp.in/
1,https://famcard.me/
2,https://fampay.in/
3,https://fampay.in/about
4,https://fampay.in/blog
...,...
518,https://famapp.in/privacy
519,https://famapp.in/terms
520,https://famapp.in/contact
521,https://www.instagram.com/fam.india/


In [24]:
def get_metrics(url, headers):
    try:
        print(f'Checking site: {url}')
        start_time = time.time()

        # Send a request
        response = requests.get(url, headers=headers)

        elapsed_time = time.time() - start_time

        # Parse the HTML of the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the title and description tags
        title_tag = soup.find('title')
        description_tag = soup.find('meta', {'name':'description'})

        # Find the viewport tag
        viewport_tag = soup.find('meta', {'name':'viewport'})

        print(f'Finished checking site: {url}, time: {elapsed_time}')

        # Return a dictionary with all metrics
        return {
            'url': url,
            'response_time': elapsed_time,
            'http_status': response.status_code,
            'title': title_tag.text if title_tag else None,
            'description': description_tag['content'] if description_tag else None, # type: ignore
            'is_mobile_friendly': viewport_tag is not None,
        }

    except Exception as e:
        print(f'Error checking site: {url}, error: {str(e)}')
        return None

In [25]:
# Define a function to check if a site is mobile friendly
def get_all_metrics(url):
    desktop_metrics = get_metrics(url, headers_desktop)
    mobile_metrics = get_metrics(url, headers_mobile)

    if desktop_metrics is None or mobile_metrics is None:
        return None

    # Combine the metrics from both versions
    combined_metrics = {
        'url': url,
        'desktop_response_time': desktop_metrics['response_time'],
        'mobile_response_time': mobile_metrics['response_time'],
        'desktop_http_status': desktop_metrics['http_status'],
        'mobile_http_status': mobile_metrics['http_status'],
        'desktop_title': desktop_metrics['title'],
        'mobile_title': mobile_metrics['title'],
        'desktop_description': desktop_metrics['description'],
        'mobile_description': mobile_metrics['description'],
        'is_mobile_friendly': desktop_metrics['is_mobile_friendly'] and mobile_metrics['is_mobile_friendly'],
    }

    print(f'Finished checking site: {url}')

    return combined_metrics

In [26]:
# Apply function to each url
df_metrics = pd.DataFrame([get_all_metrics(url) for url in df['Link'].dropna()])
df_metrics.to_csv('website_metrics.csv', index=False)

Checking site: https://famapp.in/
Finished checking site: https://famapp.in/, time: 0.22333002090454102
Checking site: https://famapp.in/
Finished checking site: https://famapp.in/, time: 0.12727069854736328
Finished checking site: https://famapp.in/
Checking site: https://famcard.me/
Finished checking site: https://famcard.me/, time: 0.2695047855377197
Checking site: https://famcard.me/
Finished checking site: https://famcard.me/, time: 0.2392745018005371
Finished checking site: https://famcard.me/
Checking site: https://fampay.in/
Finished checking site: https://fampay.in/, time: 0.2522594928741455
Checking site: https://fampay.in/
Finished checking site: https://fampay.in/, time: 0.24708819389343262
Finished checking site: https://fampay.in/
Checking site: https://fampay.in/about
Finished checking site: https://fampay.in/about, time: 0.2553226947784424
Checking site: https://fampay.in/about
Finished checking site: https://fampay.in/about, time: 0.2816956043243408
Finished checking s

In [2]:
df_metrics = pd.read_csv('website_metrics.csv')

In [3]:
fig1 = go.Figure()
fig1.add_trace(go.Box(y=df_metrics['desktop_response_time'], name='Desktop'))
fig1.add_trace(go.Box(y=df_metrics['mobile_response_time'], name='Mobile'))
fig1.update_layout(title='Response Time (Desktop vs Mobile)', xaxis_title='Platform', yaxis_title='Time (s)')
fig1.write_html("../../fam-report-site/public/Web-Analytics/response_time.html")
fig1.show()

In [14]:
fig2 = go.Figure()
fig2.add_trace(go.Violin(y=df_metrics['desktop_http_status'], box_visible=True, line_color='blue', name='Desktop'))
fig2.add_trace(go.Violin(y=df_metrics['mobile_http_status'], box_visible=True, line_color='orange', name='Mobile'))
fig2.update_layout(title='HTTP Status (Desktop vs Mobile)', xaxis_title='Platform', yaxis_title='HTTP Status Code')
fig2.write_image("../../fam-report-site/public/Web-Analytics/http_status.png")
fig2.show()

In [23]:
# Filter out success and redirection status codes
filtered_desktop_status = df_metrics[df_metrics['desktop_http_status'].apply(lambda x: x>=400)]['desktop_http_status']
filtered_mobile_status = df_metrics[df_metrics['mobile_http_status'].apply(lambda x: x>=400)]['mobile_http_status']

# Count the occurrences of each status code
desktop_status_counts = filtered_desktop_status.value_counts().reset_index()
mobile_status_counts = filtered_mobile_status.value_counts().reset_index()

# Merge both counts
merged_counts = pd.merge(desktop_status_counts, mobile_status_counts, how='outer', on='index').fillna(0)
merged_counts.columns = ['http_status', 'desktop', 'mobile']

# Add URL to the data frame
merged_counts['url'] = df_metrics['url']

# Create a stacked bar chart with hover data
fig2 = px.bar(merged_counts, x="http_status", y=["desktop", "mobile"], 
               title='HTTP Status (Desktop vs Mobile)', 
               labels={'http_status':'HTTP Status Code', 'value':'Count'}, 
               hover_data={'url':True},
               barmode='stack')

fig2.write_html("../../fam-report-site/public/Web-Analytics/http_status.html")
fig2.show()


In [20]:
fig1 = px.scatter(df_metrics, x="desktop_response_time", y="mobile_response_time", 
                  title='Response Time (Desktop vs Mobile)', labels={'desktop_response_time':'Desktop Response Time',
                                                                      'mobile_response_time':'Mobile Response Time'})
fig1.write_html("../../fam-report-site/public/Web-Analytics/response_time.html")
fig1.show()

In [21]:
# Plot 3: Donut Chart for Distribution of Mobile-friendly websites
fig3 = px.pie(df_metrics, names='is_mobile_friendly', title='Mobile-friendly Websites Distribution', hole=0.3)
fig3.write_html("../../fam-report-site/public/Web-Analytics/mobile_friendly.html")
fig3.show()

In [10]:
# Define a function to calculate Levenshtein distance ratio
def similarity_ratio(s1, s2):
    if isinstance(s1, str) and isinstance(s2, str):
        max_len = max(len(s1), len(s2))
        return (max_len - distance(s1, s2)) / max_len
    else:
        return 0

# Compute similarity between desktop and mobile versions
df_metrics['title_similarity'] = df_metrics.apply(lambda x: similarity_ratio(x['desktop_title'], x['mobile_title']), axis=1)
df_metrics['description_similarity'] = df_metrics.apply(lambda x: similarity_ratio(x['desktop_description'], x['mobile_description']), axis=1)

In [11]:
# Plot 4: Title Similarity
fig4 = px.histogram(df_metrics, x='title_similarity', nbins=50, title='Title Tag Similarity (Desktop vs Mobile)')
fig4.update_xaxes(title_text='Similarity Ratio')
fig4.update_yaxes(title_text='Count')
fig4.write_html("../../fam-report-site/public/Web-Analytics/title_similarity.html")
fig4.show()

In [28]:
# Ensure the response times are numeric
df_metrics['desktop_response_time'] = pd.to_numeric(df_metrics['desktop_response_time'], errors='coerce')
df_metrics['mobile_response_time'] = pd.to_numeric(df_metrics['mobile_response_time'], errors='coerce')

# Then compute the difference
df_metrics['response_time_difference'] = abs(df_metrics['desktop_response_time'] - df_metrics['mobile_response_time'])

In [31]:
# Observation 1: Variation in HTTP Status Codes
status_diff_df = df_metrics[df_metrics['desktop_http_status'] != df_metrics['mobile_http_status']]
status_diff_df

Unnamed: 0,url,desktop_response_time,mobile_response_time,desktop_http_status,mobile_http_status,desktop_title,mobile_title,desktop_description,mobile_description,is_mobile_friendly,title_similarity,description_similarity,response_time_difference
522,https://www.linkedin.com/company/famindia/,0.498417,0.951617,429,999,,,,,False,0.0,0.0,0.4532


In [34]:
# Observation 2: Title and Description Similarity
title_diff_df = df_metrics[df_metrics['desktop_title'] != df_metrics['mobile_title']]
description_diff_df = df_metrics[df_metrics['desktop_description'] != df_metrics['mobile_description']]
print(f'Websites with different title for desktop and mobile:\n{title_diff_df}')
print(f'Websites with different description for desktop and mobile:\n{description_diff_df}')
title_diff_df.to_csv('title_difference.csv', index=False)
description_diff_df.to_csv('description_difference.csv', index=False)

Websites with different title for desktop and mobile:
                                                   url  desktop_response_time  \
100  https://www.facebook.com/sharer/sharer.php?u=h...               0.431044   
101  https://twitter.com/intent/tweet?text=How%20to...               1.063523   
102                    https://facebook.com/fampay.in/               0.664518   
107  https://www.facebook.com/sharer/sharer.php?u=h...               0.338678   
108  https://twitter.com/intent/tweet?text=Discover...               1.009599   
..                                                 ...                    ...   
483  https://twitter.com/intent/tweet?text=How%20a%...               1.099900   
497  https://www.facebook.com/sharer/sharer.php?u=h...               0.359163   
498  https://twitter.com/intent/tweet?text=How%20I%...               1.187847   
521               https://www.instagram.com/fam.india/               0.695001   
522         https://www.linkedin.com/company/famindia/ 

In [37]:
# Observation 3: Mobile-Friendly Websites
non_mobile_friendly_df = df_metrics[df_metrics['is_mobile_friendly'] == False]
print(f'Websites that are not mobile-friendly:\n{non_mobile_friendly_df}')
non_mobile_friendly_df.to_csv('non_mobile_friendly_websites.csv', index=False)

Websites that are not mobile-friendly:
                                                   url  desktop_response_time  \
100  https://www.facebook.com/sharer/sharer.php?u=h...               0.431044   
102                    https://facebook.com/fampay.in/               0.664518   
107  https://www.facebook.com/sharer/sharer.php?u=h...               0.338678   
111  https://www.facebook.com/sharer/sharer.php?u=h...               0.333694   
115  https://www.facebook.com/sharer/sharer.php?u=h...               0.582654   
..                                                 ...                    ...   
473  https://www.facebook.com/sharer/sharer.php?u=h...               0.332674   
477  https://www.facebook.com/sharer/sharer.php?u=h...               0.565465   
482  https://www.facebook.com/sharer/sharer.php?u=h...               0.592070   
497  https://www.facebook.com/sharer/sharer.php?u=h...               0.359163   
522         https://www.linkedin.com/company/famindia/               0

In [36]:
# Observation 4: Error Status Codes
error_status_df = df_metrics[(df_metrics['desktop_http_status'] >= 400) | (df_metrics['mobile_http_status'] >= 400)]
print(f'Websites with error status codes:\n{error_status_df}')

Websites with error status codes:
                                                   url  desktop_response_time  \
144           https://www.linkedin.com/company/fampay/               0.731333   
220  https://www.linkedin.com/company/fampay/mycomp...               0.527399   
252  https://fampay.in/blog/famcard-your-first-debi...               0.317363   
522         https://www.linkedin.com/company/famindia/               0.498417   

     mobile_response_time  desktop_http_status  mobile_http_status  \
144              0.585905                  999                 999   
220              0.509985                  999                 999   
252              0.326149                  404                 404   
522              0.951617                  429                 999   

        desktop_title      mobile_title desktop_description  \
144               NaN               NaN                 NaN   
220               NaN               NaN                 NaN   
252  UnRead by FamPa