# Patent Analysis : ESPACENET
***

### Basic Analysis 
- *Count the number of unique applicants*
- *Count the number of unique inventors*
- *Number of applications per year*
- *Distribution of patents per country*
### Temporal Analysis
- *Observe the patent Distribution over publication date*
- *Observe the patent Distribution over application date*
### Geographical Analysis
- *Distribution of applicants and inventors around the world*
- *Collaboration network between the inventors*
### Text Analysis : NLP
- *Extract the language from the abstract*
- *Analyse the abstract (extract key words)*
- *Cluster patents into groups based on their abstract similarities.*

## 0- Libraries

In [64]:

from collections import Counter

import matplotlib.pyplot as plt
import nltk
import pyspark
import seaborn as sns
from langdetect import DetectorFactory
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import col, to_date, year, split, array_distinct, concat_ws, explode, lit
from translate import Translator
from wordcloud import WordCloud

## 1 - Importing and Preprocessing the data from MongoDB

In [None]:
# Show which PySpark release this notebook runs against.
spark_version = pyspark.__version__
print(spark_version)

In [None]:
# Build (or reuse) the Spark session for this notebook. The MongoDB Spark
# connector is requested through spark.jars.packages.
mongo_connector_package = "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0"
spark = (
    SparkSession.builder
    .appName("PatentAnalysis")
    .config("spark.jars.packages", mongo_connector_package)
    .getOrCreate()
)

In [None]:

# Connection settings for the local MongoDB instance holding the scraped data.
mongo_ip = "mongodb://127.0.0.1:27017/"
db_name = "espacenet_patent"
collection_name = "espacenet_data"

# The connector URI has the form <server>/<database>.<collection>.
mongo_uri = f"{mongo_ip}{db_name}.{collection_name}"
mongo_reader = spark.read.format("com.mongodb.spark.sql.DefaultSource")
df = mongo_reader.option("uri", mongo_uri).load()

# Print the schema that was read for the collection.
df.printSchema()

### II - Preprocessing the data

In [6]:

# Parse the publication date (yyyy-MM-dd) into the Date_publication column.
# NOTE(review): the source field is spelled "Date__publication" (double
# underscore) - confirm this matches the field name stored in MongoDB.
publication_date = to_date(col("Date__publication"), "yyyy-MM-dd")
df = df.withColumn("Date_publication", publication_date)

In [7]:
# Parse the application date (yyyy-MM-dd) into the Date_application column.
application_date = to_date(col("Date_Application"), "yyyy-MM-dd")
df = df.withColumn("Date_application", application_date)

In [None]:

# Inventors_country holds countries joined by " & "; remove repeated entries
# by splitting, de-duplicating the array and joining it back in place.
unique_inventor_countries = array_distinct(split(col("Inventors_country"), " & "))
df = df.withColumn("Inventors_country", concat_ws(" & ", unique_inventor_countries))
df.show()


In [None]:

# Applicants_country holds countries joined by " & "; remove repeated entries
# by splitting, de-duplicating the array and joining it back in place.
unique_applicant_countries = array_distinct(split(col("Applicants_country"), " & "))
df = df.withColumn("Applicants_country", concat_ws(" & ", unique_applicant_countries))
df.show()

In [None]:
# Collect the Spark DataFrame into pandas so blank cells can be rewritten.
data = df.toPandas()

# Empty strings and single spaces are replaced by the literal marker 'NULL'.
blank_markers = {'': 'NULL', ' ': 'NULL'}
for column in data.columns:
    data[column] = data[column].replace(blank_markers)

In [11]:
# Turn the cleaned pandas frame back into a Spark DataFrame, rebinding `df`.
df= spark.createDataFrame(data)

In [None]:
# Display summary statistics for the cleaned DataFrame.
summary_statistics = df.describe()
summary_statistics.show()

In [13]:
# Register the DataFrame as the temporary view "patent_data" so that the
# Spark SQL queries in the following cells can reference it by name.
df.createOrReplaceTempView("patent_data")

In [None]:
# Preview the first 50 rows of the registered view.
all_patents = spark.sql("SELECT * FROM patent_data")
all_patents.show(50)

## 2 -  Basic Analysis

###  1. Count the number of unique applicants and inventors

In [None]:
# Distinct values of the Applicants column, with their total count.
unique_applicants_query = "select distinct(Applicants) from patent_data"
table = spark.sql(unique_applicants_query)
unique_applicants_total = table.count()
print(f"Nomber of unique Applicants {unique_applicants_total}")
table.show()

In [None]:
# Distinct values of the Inventors column, with their total count.
unique_inventors_query = "select distinct(Inventors) from patent_data"
table = spark.sql(unique_inventors_query)
unique_inventors_total = table.count()
print(f"Nomber of unique Inventors {unique_inventors_total}")
table.show()

### 2. Number of patents invented (applied for) by each inventor (applicant)

In [None]:
# Patent count per Applicants value, largest first.
patents_per_applicant_query = (
    "select Applicants, count(Applicants) as nomber_patent from patent_data GROUP BY Applicants ORDER BY nomber_patent DESC"
)
table = spark.sql(patents_per_applicant_query)
table.show()

In [None]:
# Patent count per Inventors value, largest first.
patents_per_inventor_query = (
    "select Inventors, count(Inventors) as nomber_patent from patent_data GROUP BY Inventors ORDER BY nomber_patent DESC"
)
table = spark.sql(patents_per_inventor_query)
table.show()

### 3. Number of patents per country

In [None]:
# Patent count per Inventors_country value, largest first. The result stays
# bound to `table` because the plotting cell that follows reads it.
patents_per_country_query = (
    "select count(*) as number_patent, Inventors_country  from patent_data GROUP BY Inventors_country ORDER BY number_patent DESC"
)
table = spark.sql(patents_per_country_query)
table.show()

In [None]:
# Bar chart of the per-country patent counts held in `table`.
pd_table = table.toPandas()

fig, ax = plt.subplots(figsize=(10, 4))
sns.barplot(data=pd_table, x='Inventors_country', y='number_patent', color='darkred', ax=ax)
ax.set_title('Number of patent by inventors country')
ax.set_xlabel('Inventors country')
ax.set_ylabel('number patent')
# Rotate the country labels so they do not overlap.
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

## 3 - Temporal Analysis

### 1. Patent distribution per year

In [None]:
# Replace the publication date by its year and count the patents per year.
df_year = df.withColumn("Date_publication", year(col("Date_publication")))

# createOrReplaceTempView registers the view and returns None, so its result
# is deliberately not assigned to a variable.
df_year.createOrReplaceTempView("year_table")

table = spark.sql("select count(*) as number_patent, Date_publication  from year_table GROUP BY Date_publication ORDER BY number_patent DESC")
table.show()

In [None]:
# Line chart of the yearly patent counts held in `table`.
pd_table = table.toPandas()

fig, ax = plt.subplots(figsize=(8, 4))
sns.lineplot(data=pd_table, x='Date_publication', y='number_patent', color='darkred', ax=ax)
ax.set_title('Number of Patents by Publication Date')
ax.set_xlabel('Publication Date')
ax.set_ylabel('Number of Patents')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

## 4 - Geographical Analysis

### 1. distribution of applicants and inventors around the world

In [None]:
# Produce one row per applicant country (the column holds countries joined by
# " & "), then count the rows for each country.
single_applicant_country = explode(split(col("applicants_country"), " & "))
df_applicants_country = df.withColumn("applicants_country", single_applicant_country)
applicants_country_count = (
    df_applicants_country
    .groupBy("applicants_country")
    .count()
)
applicants_country_count.show()

In [None]:
# Bar chart of the number of rows per applicant country.
pd_table = applicants_country_count.toPandas()

fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=pd_table, x='applicants_country', y='count', color='darkred', ax=ax)
ax.set_title('Number of Applicants by country')
ax.set_xlabel('Applicants country')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Produce one row per inventor country (the column holds countries joined by
# " & "), then count the rows for each country.
single_inventor_country = explode(split(col("inventors_country"), " & "))
df_inventors_country = df.withColumn("inventors_country", single_inventor_country)
inventors_country_count = (
    df_inventors_country
    .groupBy("inventors_country")
    .count()
)
inventors_country_count.show()

In [None]:
# Bar chart of the number of rows per inventor country.
pd_table = inventors_country_count.toPandas()

fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=pd_table, x='inventors_country', y='count', color='darkred', ax=ax)
ax.set_title('Number of inventors by country')
ax.set_xlabel('inventors country')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

### 2. Collaboration network between the inventors

In [None]:
# Keep only rows whose Inventors_country lists several countries (contains
# "&"), then count the rows for each country combination.
has_several_inventor_countries = col("Inventors_country").contains("&")
df_collaborations = df.where(has_several_inventor_countries)
collaborations_count = (
    df_collaborations
    .groupBy("inventors_country")
    .count()
)
collaborations_count.show()

In [None]:
# Keep only rows whose Applicants_country lists several countries (contains
# "&"), then count the rows for each country combination.
has_several_applicant_countries = col("Applicants_country").contains("&")
df_collaborations = df.where(has_several_applicant_countries)
collaborations_count = (
    df_collaborations
    .groupBy("Applicants_country")
    .count()
)
collaborations_count.show()

## 5 - Text Analysis

### 1. Extract Language from the abstract

In [29]:

# Collect the Spark DataFrame into pandas; the text-analysis cells below
# iterate over pd_data["Patent_abstract"] in plain Python.
pd_data = df.toPandas()

In [None]:
# Detect the language of every abstract and count how often each one occurs.

# langdetect is non-deterministic unless its seed is fixed; pin it so that
# re-running the cell yields the same counts.
DetectorFactory.seed = 0

languages = {}
for index, abstract in enumerate(pd_data["Patent_abstract"]):
    # Missing values and the 'NULL' placeholder written during preprocessing
    # carry no language; label them "unknown" instead of guessing.
    if not isinstance(abstract, str) or abstract.strip() in ("", "NULL"):
        detected_language = "unknown"
    else:
        try:
            detected_language = detect(abstract)
        except LangDetectException:
            # Raised when the text has no features langdetect can use
            # (for example digits or punctuation only).
            detected_language = "unknown"

    languages[f"Abstract {index}"] = detected_language

language_counts = Counter(languages.values())

print("Language Counts: ")
for language, counts in language_counts.items():
    print(f"Language : {language} -> {counts}")

In [None]:
# Pie chart of the share of each detected language.
language_labels = list(language_counts)
language_sizes = [language_counts[label] for label in language_labels]

fig, ax = plt.subplots(figsize=(10, 10))
ax.pie(language_sizes, labels=language_labels, autopct='%1.1f%%', startangle=140)
ax.set_title('Language Distribution of Patent Abstracts \n')
# Equal axis scaling keeps the pie circular.
ax.axis('equal')
plt.show()

### 2. Extract key words from the abstract

### 3. Translate the Patent Abstract to English

In [None]:
# Translate every non-English abstract to English and print the result.
for index, abstract in enumerate(pd_data["Patent_abstract"]):
    # Missing values and the 'NULL' placeholder written during preprocessing
    # have nothing to detect or translate.
    if not isinstance(abstract, str) or abstract.strip() in ("", "NULL"):
        continue

    try:
        detected_language = detect(abstract)
    except LangDetectException:
        # Raised when the text has no features langdetect can use.
        continue

    if detected_language != 'en':
        # Translator treats the input as English unless from_lang is given,
        # so pass the detected source language explicitly. Language codes are
        # used on both sides to match what langdetect returns.
        translator = Translator(to_lang="en", from_lang=detected_language)
        # translate() returns the translated text as a plain string.
        translation = translator.translate(abstract)

        print(f"---------- Translated Abstract {index} ----------")
        print(translation)


### 4. Plotting the WordCloud

In [None]:

# Build a word cloud from the text of all abstracts.

# Keep real text only: missing values would make str.join raise TypeError,
# and the 'NULL' placeholder written during preprocessing would otherwise
# show up as a word in the cloud.
abstract_texts = [
    text
    for text in pd_data["Patent_abstract"]
    if isinstance(text, str) and text.strip() not in ("", "NULL")
]
all_abstracts = ' '.join(abstract_texts)
wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='plasma').generate(all_abstracts)

plt.figure(figsize=(10, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Word Cloud of Patent Abstracts')
plt.axis('off')
plt.show()


**Big Data With Spark**
****
*Notebook*: Analyzing patent metadata scraped from Espacenet using PySpark