# Analyse des brevets de WIPO

In [None]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, to_date, year
from pyspark.sql.functions import split, array_distinct, concat_ws


# Spark session for the notebook; the MongoDB connector is requested as a
# package dependency of the session.
spark = (
    SparkSession.builder
    .appName("wipo_patent")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0")
    .getOrCreate()
)

# Connection settings. The read URI is derived from these three values so that
# changing the host, database or collection only has to be done in one place.
mongo_ip = "mongodb://127.0.0.1:27017/"
db_name = "big_data_project"
collection_name = "wipo_nv"
mongo_uri = f"{mongo_ip}{db_name}.{collection_name}"

# Load the collection as a Spark DataFrame through the MongoDB data source.
df = (
    spark.read.format("com.mongodb.spark.sql.DefaultSource")
    .option("uri", mongo_uri)
    .load()
)

df.printSchema()


In [None]:
# Register the DataFrame as the SQL view "patents_data", then display every
# distinct value of the Publication_Kind column.
df.createOrReplaceTempView("patents_data")
distinct_kinds = spark.sql("SELECT DISTINCT Publication_Kind FROM patents_data")
distinct_kinds.show()

# Nombre de brevets par bureau

In [None]:
from pyspark.sql import functions as F

# Number of rows per Office, most frequent office first. Rows whose Office is
# NULL or the literal string 'NaN' are left out by the WHERE clause.
office_count_query = """
    SELECT Office, COUNT(*) AS Occurrences
    FROM patents_data
    WHERE Office IS NOT NULL AND Office != 'NaN' 
    GROUP BY Office
    ORDER BY Occurrences DESC
"""

# (Re-)register the view so this cell does not depend on the previous one.
df.createOrReplaceTempView("patents_data")
office_counts = spark.sql(office_count_query)

office_counts.show()

In [None]:
# matplotlib is otherwise first imported in a later cell; import it here so this
# cell also works when the notebook is executed from top to bottom.
import matplotlib.pyplot as plt

# Bring the per-office counts to pandas for plotting.
df_pd = office_counts.toPandas()

# office_counts is ordered by descending Occurrences, so the first ten rows
# are the ten most frequent offices.
df_pd_top = df_pd.head(10)

plt.figure(figsize=(10, 6))
plt.barh(df_pd_top["Office"], df_pd_top["Occurrences"], color='skyblue')
plt.xlabel('Occurrences')
plt.ylabel('Office')
plt.title('Top 10 Offices by Occurrences')
# barh draws the first row at the bottom; invert so the largest bar is on top.
plt.gca().invert_yaxis()
plt.show()

# Nombre de brevets par type de publication

In [None]:
# Row count per Publication_Kind value, largest group first.
publication_kind_totals = df.groupBy("Publication_Kind").count()
patents_by_publication_kind = publication_kind_totals.orderBy("count", ascending=False)
patents_by_publication_kind.show()


# Nombre de brevets par demandeur

In [None]:
# Row count per Applicants value, largest group first.
applicant_totals = df.groupBy("Applicants").count()
patents_by_applicant = applicant_totals.orderBy("count", ascending=False)
patents_by_applicant.show()


# Nombre de brevets par agent

In [None]:
# Row count per Agents value, largest group first.
agent_totals = df.groupBy("Agents").count()
patents_by_agent = agent_totals.orderBy("count", ascending=False)
patents_by_agent.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import pyspark.sql.functions as F

# Bucket every row by the first letter of Publication_Kind (A, B or C) and
# count the rows per bucket; anything else falls into the 'NAN' bucket.
# The CASE expression is repeated in GROUP BY so rows are grouped by bucket.
publication_group_query = """
    SELECT 
        CASE 
            WHEN Publication_Kind LIKE 'A%' THEN 'A%'
            WHEN Publication_Kind LIKE 'B%' THEN 'B%'
            WHEN Publication_Kind LIKE 'C%' THEN 'C%'
            ELSE 'NAN'
        END AS Publication_Group,
        COUNT(*) AS Count
    FROM patents_data 
    GROUP BY 
        CASE 
            WHEN Publication_Kind LIKE 'A%' THEN 'A%'
            WHEN Publication_Kind LIKE 'B%' THEN 'B%'
            WHEN Publication_Kind LIKE 'C%' THEN 'C%'
            ELSE 'NAN'
        END
"""

result = spark.sql(publication_group_query)
result.show()



In [None]:
# Pie chart of the share of patents in each publication group.
df_P = result.toPandas()
colors = {'A%': 'lightblue', 'B%': 'lightgreen', 'C%': 'lightcoral', 'NAN': 'lightgray'}

# Legend text per group. The labels are looked up by group rather than listed in
# a fixed order, because the row order of the GROUP BY result is not guaranteed;
# a positional list could attach the wrong label to a wedge.
legend_labels = {
    'A%': 'First publication level',
    'B%': 'Second publication level',
    'C%': 'Third publication level',
    'NAN': 'NAN',
}

groups = list(df_P["Publication_Group"])

# With autopct set, pie() returns (wedges, label texts, percentage texts).
wedges, _label_texts, _pct_texts = plt.pie(
    df_P["Count"],
    labels=None,
    colors=[colors[group] for group in groups],
    autopct='%1.1f%%',
    startangle=140,
)

# Pass the wedges explicitly so each legend entry is tied to its own wedge.
plt.legend(
    wedges,
    [legend_labels[group] for group in groups],
    loc='upper right',
    bbox_to_anchor=(1.25, 1),
)

plt.title("Number of Patents in Each Level, in percentages")

plt.text(-1.5, 1.5, "Levels A, B, and C belong to Group 1: Used for documents resulting from a patent application and being identified as the primary or major series.", fontsize=10, wrap=True)

# Equal aspect ratio keeps the pie circular.
plt.axis('equal')
plt.show()

## Analyse des Inventeurs les Plus Fréquents 

In [None]:

from pyspark.sql import functions as F


# Number of rows per Inventors value, most frequent first. Rows whose Inventors
# is NULL or the literal string 'NaN' are left out by the WHERE clause.
inventor_count_query = """
    SELECT Inventors, COUNT(*) AS Occurrences
    FROM patents_data
    WHERE Inventors IS NOT NULL AND Inventors != 'NaN'  -- Exclure les valeurs NaN
    GROUP BY Inventors
    ORDER BY Occurrences DESC
"""

inventor_counts = spark.sql(inventor_count_query)

# Only the five most frequent values are displayed.
inventor_counts.show(5)


In [26]:
# Convert the full Spark DataFrame to a pandas DataFrame; the text-processing
# cells that follow operate on this pandas copy.
# NOTE(review): this rebinds df_pd, which an earlier cell used for the
# per-office counts.
df_pd = df.toPandas()

In [27]:
# Clean the English_Text column in two passes: first remove the literal "(EN)"
# marker, then drop every character that is not an ASCII letter, digit or
# whitespace.
titles_without_marker = df_pd['English_Text'].str.replace(r'\(EN\)', '', regex=True)
df_pd['English_Text'] = titles_without_marker.str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

In [28]:
# Drop every character of Abstract_english that is not an ASCII letter, digit
# or whitespace.
raw_abstracts = df_pd['Abstract_english']
df_pd['Abstract_english'] = raw_abstracts.str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

In [29]:
# Lower-case both text columns in place.
for text_column in ('Abstract_english', 'English_Text'):
    df_pd[text_column] = df_pd[text_column].str.lower()

In [None]:
# Preview the first two rows after the cleaning steps.
df_pd.head(2)

In [None]:
import nltk
# Fetch the NLTK resources requested by this notebook: the 'punkt' tokenizer
# data and the 'stopwords' corpus.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' when tokenizing -- confirm against the installed version.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)


In [53]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

def _stop_words():
    """Return the combined English and French NLTK stop words, loaded only once."""
    cached = getattr(_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english')) | frozenset(stopwords.words('french'))
        # Keep the set on the function so later calls do not reload the corpus.
        _stop_words._cache = cached
    return cached


def process_abstract(abstract):
    """Tokenize a text and remove punctuation and English/French stop words.

    Args:
        abstract: Text to tokenize. Non-string values (for example ``None`` or
            the ``NaN`` pandas uses for missing text) and blank strings are
            accepted and produce an empty list instead of raising.

    Returns:
        A list of lower-cased tokens, in their original order, excluding
        tokens made up solely of punctuation characters and excluding
        English and French stop words.
    """
    if not isinstance(abstract, str) or not abstract.strip():
        return []

    punctuation = set(string.punctuation)
    stop_words = _stop_words()

    tokens = []
    for word in word_tokenize(abstract):
        # Skip tokens consisting only of punctuation. A character-wise test
        # also catches multi-character tokens such as "..." or "''".
        if all(char in punctuation for char in word):
            continue
        lowered = word.lower()
        if lowered not in stop_words:
            tokens.append(lowered)
    return tokens

# Tokenize both cleaned text columns; each (source, target) pair stores the
# token list produced by process_abstract in a new column.
for source_column, token_column in (
    ('Abstract_english', 'abstract_tokens'),
    ('English_Text', 'Title_tokens'),
):
    df_pd[token_column] = df_pd[source_column].apply(process_abstract)

In [51]:
# Optional reset of the token columns, e.g. before re-running the tokenization
# cell. It is disabled by default: the cells that follow read
# 'abstract_tokens' and 'Title_tokens', so dropping them unconditionally makes
# a top-to-bottom run fail with KeyError.
RESET_TOKEN_COLUMNS = False

if RESET_TOKEN_COLUMNS:
    # errors='ignore' keeps the reset harmless when the columns are already gone.
    df_pd = df_pd.drop(columns=['abstract_tokens', 'Title_tokens'], errors='ignore')

In [None]:
# Preview the first two rows of the pandas DataFrame.
df_pd.head(2)

In [55]:
from collections import Counter

# Flatten the per-row token lists into one list per column, then count how
# often each token occurs.
all_words_abs = []
for row_tokens in df_pd["abstract_tokens"]:
    all_words_abs.extend(row_tokens)

all_words_tit = []
for row_tokens in df_pd["Title_tokens"]:
    all_words_tit.extend(row_tokens)

abstract_word_counts = Counter(all_words_abs)
title_word_counts = Counter(all_words_tit)


In [None]:
# Print the ten most frequent tokens, abstracts first and titles second.
for word_counts in (abstract_word_counts, title_word_counts):
    print(word_counts.most_common(10))

## Word cloud des mots dans les titres

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Word cloud sized by the token frequencies of the titles.
title_cloud = WordCloud(width=800, height=400, background_color='white')
wordcloud = title_cloud.generate_from_frequencies(title_word_counts)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # hide the axes around the image
plt.show()


## Word cloud des mots dans les résumés

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Word cloud sized by the token frequencies of the abstracts.
abstract_cloud = WordCloud(width=800, height=400, background_color='white')
wordcloud = abstract_cloud.generate_from_frequencies(abstract_word_counts)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # hide the axes around the image
plt.show()


## Regrouper les mots des résumés en clusters selon les années d'apparition

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import pandas as pd

# Parse the publication dates so the year can be extracted later on.
df_pd['Publication_Date'] = pd.to_datetime(df_pd['Publication_Date'])

# Rebuild one space-separated string per row from the abstract tokens.
df_pd['abstract_text'] = df_pd['abstract_tokens'].apply(' '.join)

# TF-IDF features of the abstracts, clustered into two groups with K-Means.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_pd['abstract_text'])

kmeans = KMeans(n_clusters=2, random_state=42).fit(X)

# Store the cluster assigned to each row.
df_pd['Cluster_label'] = kmeans.labels_


In [None]:
# Number of rows per (publication year, abstract cluster).
clusters_by_year = (
    df_pd.groupby([df_pd['Publication_Date'].dt.year, 'Cluster_label'])
    .size()
    .reset_index(name='Count')
)

# Draw the clusters side by side within each year. Plotting every cluster at
# the same x position would make later bars cover the earlier ones.
cluster_labels = sorted(clusters_by_year['Cluster_label'].unique())
bar_width = 0.8 / max(len(cluster_labels), 1)

plt.figure(figsize=(10, 6))
for position, cluster_label in enumerate(cluster_labels):
    cluster_data = clusters_by_year[clusters_by_year['Cluster_label'] == cluster_label]
    # Centre the group of bars on the year tick.
    offset = (position - (len(cluster_labels) - 1) / 2) * bar_width
    plt.bar(
        cluster_data['Publication_Date'] + offset,
        cluster_data['Count'],
        width=bar_width,
        label=f'Cluster {cluster_label}',
    )

# One tick per year, shown as a whole number.
plt.xticks(sorted(clusters_by_year['Publication_Date'].astype(int).unique()))
plt.xlabel('Année')
plt.ylabel('Nombre d\'occurrences')
plt.title('Occurrences de clusters par année')
plt.legend()
plt.show()


In [None]:
def _join_texts(texts):
    """Concatenate the texts of one cluster into a single space-separated string."""
    return ' '.join(texts)


# One combined abstract text per cluster.
tokens_by_cluster = df_pd.groupby('Cluster_label')['abstract_text'].apply(_join_texts)

# One word cloud figure per cluster.
for cluster_label, text in tokens_by_cluster.items():
    cloud_builder = WordCloud(background_color='white')
    wordcloud = cloud_builder.generate(text)

    plt.figure(figsize=(8, 4))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Word Cloud pour Cluster {cluster_label}')
    plt.axis('off')
    plt.show()

## Regrouper les mots des titres en clusters selon les années d'apparition

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import pandas as pd

# Parse the publication dates so the year can be extracted later on.
df_pd['Publication_Date'] = pd.to_datetime(df_pd['Publication_Date'])

# Rebuild one space-separated string per row from the title tokens.
df_pd['title_text'] = df_pd['Title_tokens'].apply(' '.join)

# TF-IDF features of the titles, clustered into two groups with K-Means.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_pd['title_text'])

kmeans = KMeans(n_clusters=2, random_state=42).fit(X)

# Store the cluster assigned to each row.
df_pd['Cluster_label_title'] = kmeans.labels_


In [None]:

# Number of rows per (publication year, title cluster).
clusters_by_year = (
    df_pd.groupby([df_pd['Publication_Date'].dt.year, 'Cluster_label_title'])
    .size()
    .reset_index(name='Count')
)

# Draw the clusters side by side within each year. Plotting every cluster at
# the same x position would make later bars cover the earlier ones.
cluster_labels = sorted(clusters_by_year['Cluster_label_title'].unique())
bar_width = 0.8 / max(len(cluster_labels), 1)

plt.figure(figsize=(10, 6))
for position, cluster_label in enumerate(cluster_labels):
    cluster_data = clusters_by_year[clusters_by_year['Cluster_label_title'] == cluster_label]
    # Centre the group of bars on the year tick.
    offset = (position - (len(cluster_labels) - 1) / 2) * bar_width
    plt.bar(
        cluster_data['Publication_Date'] + offset,
        cluster_data['Count'],
        width=bar_width,
        label=f'Cluster {cluster_label}',
    )

# One tick per year, shown as a whole number.
plt.xticks(sorted(clusters_by_year['Publication_Date'].astype(int).unique()))
plt.xlabel('Année')
plt.ylabel('Nombre d\'occurrences')
plt.title('Occurrences de clusters par année')
plt.legend()
plt.show()


In [None]:
def _join_texts(texts):
    """Concatenate the texts of one cluster into a single space-separated string."""
    return ' '.join(texts)


# One combined title text per cluster.
tokens_by_cluster = df_pd.groupby('Cluster_label_title')['title_text'].apply(_join_texts)

# One word cloud figure per cluster.
for cluster_label, text in tokens_by_cluster.items():
    cloud_builder = WordCloud(background_color='white')
    wordcloud = cloud_builder.generate(text)

    plt.figure(figsize=(8, 4))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Word Cloud pour Cluster {cluster_label}')
    plt.axis('off')
    plt.show()